summaryrefslogtreecommitdiff
path: root/net
diff options
context:
space:
mode:
authorJiri Kosina <jkosina@suse.cz>2016-04-18 12:18:55 +0300
committerJiri Kosina <jkosina@suse.cz>2016-04-18 12:18:55 +0300
commit9938b04472d5c59f8bd8152a548533a8599596a2 (patch)
tree0fc8318100878c5e446076613ec02a97aa179119 /net
parentbd7ced98812dbb906950d8b0ec786f14f631cede (diff)
parentc3b46c73264b03000d1e18b22f5caf63332547c9 (diff)
downloadlinux-9938b04472d5c59f8bd8152a548533a8599596a2.tar.xz
Merge branch 'master' into for-next
Sync with Linus' tree so that patches against newer codebase can be applied. Signed-off-by: Jiri Kosina <jkosina@suse.cz>
Diffstat (limited to 'net')
-rw-r--r--net/6lowpan/6lowpan_i.h28
-rw-r--r--net/6lowpan/Kconfig47
-rw-r--r--net/6lowpan/Makefile9
-rw-r--r--net/6lowpan/core.c96
-rw-r--r--net/6lowpan/debugfs.c300
-rw-r--r--net/6lowpan/iphc.c413
-rw-r--r--net/6lowpan/nhc_ghc_ext_dest.c27
-rw-r--r--net/6lowpan/nhc_ghc_ext_frag.c28
-rw-r--r--net/6lowpan/nhc_ghc_ext_hop.c27
-rw-r--r--net/6lowpan/nhc_ghc_ext_route.c27
-rw-r--r--net/6lowpan/nhc_ghc_icmpv6.c27
-rw-r--r--net/6lowpan/nhc_ghc_udp.c27
-rw-r--r--net/8021q/vlan.c2
-rw-r--r--net/8021q/vlan_core.c4
-rw-r--r--net/8021q/vlan_dev.c21
-rw-r--r--net/8021q/vlanproc.c3
-rw-r--r--net/8021q/vlanproc.h4
-rw-r--r--net/9p/trans_fd.c88
-rw-r--r--net/9p/trans_rdma.c90
-rw-r--r--net/9p/trans_virtio.c18
-rw-r--r--net/Kconfig33
-rw-r--r--net/Makefile1
-rw-r--r--net/appletalk/ddp.c2
-rw-r--r--net/atm/common.c4
-rw-r--r--net/atm/mpc.h4
-rw-r--r--net/atm/mpoa_caches.c4
-rw-r--r--net/ax25/af_ax25.c3
-rw-r--r--net/ax25/ax25_ip.c15
-rw-r--r--net/batman-adv/Kconfig16
-rw-r--r--net/batman-adv/Makefile5
-rw-r--r--net/batman-adv/bat_algo.h30
-rw-r--r--net/batman-adv/bat_iv_ogm.c193
-rw-r--r--net/batman-adv/bat_v.c347
-rw-r--r--net/batman-adv/bat_v_elp.c515
-rw-r--r--net/batman-adv/bat_v_elp.h33
-rw-r--r--net/batman-adv/bat_v_ogm.c833
-rw-r--r--net/batman-adv/bat_v_ogm.h36
-rw-r--r--net/batman-adv/bitarray.c14
-rw-r--r--net/batman-adv/bitarray.h14
-rw-r--r--net/batman-adv/bridge_loop_avoidance.c382
-rw-r--r--net/batman-adv/bridge_loop_avoidance.h4
-rw-r--r--net/batman-adv/debugfs.c17
-rw-r--r--net/batman-adv/debugfs.h2
-rw-r--r--net/batman-adv/distributed-arp-table.c105
-rw-r--r--net/batman-adv/distributed-arp-table.h2
-rw-r--r--net/batman-adv/fragmentation.c42
-rw-r--r--net/batman-adv/fragmentation.h4
-rw-r--r--net/batman-adv/gateway_client.c132
-rw-r--r--net/batman-adv/gateway_client.h2
-rw-r--r--net/batman-adv/gateway_common.c121
-rw-r--r--net/batman-adv/gateway_common.h4
-rw-r--r--net/batman-adv/hard-interface.c90
-rw-r--r--net/batman-adv/hard-interface.h30
-rw-r--r--net/batman-adv/hash.c2
-rw-r--r--net/batman-adv/hash.h24
-rw-r--r--net/batman-adv/icmp_socket.c10
-rw-r--r--net/batman-adv/icmp_socket.h2
-rw-r--r--net/batman-adv/main.c133
-rw-r--r--net/batman-adv/main.h41
-rw-r--r--net/batman-adv/multicast.c46
-rw-r--r--net/batman-adv/multicast.h4
-rw-r--r--net/batman-adv/network-coding.c175
-rw-r--r--net/batman-adv/network-coding.h2
-rw-r--r--net/batman-adv/originator.c463
-rw-r--r--net/batman-adv/originator.h24
-rw-r--r--net/batman-adv/packet.h71
-rw-r--r--net/batman-adv/routing.c137
-rw-r--r--net/batman-adv/routing.h5
-rw-r--r--net/batman-adv/send.c101
-rw-r--r--net/batman-adv/send.h16
-rw-r--r--net/batman-adv/soft-interface.c73
-rw-r--r--net/batman-adv/soft-interface.h4
-rw-r--r--net/batman-adv/sysfs.c178
-rw-r--r--net/batman-adv/sysfs.h2
-rw-r--r--net/batman-adv/translation-table.c389
-rw-r--r--net/batman-adv/translation-table.h2
-rw-r--r--net/batman-adv/types.h220
-rw-r--r--net/bluetooth/6lowpan.c15
-rw-r--r--net/bluetooth/Kconfig9
-rw-r--r--net/bluetooth/Makefile1
-rw-r--r--net/bluetooth/af_bluetooth.c32
-rw-r--r--net/bluetooth/bnep/core.c7
-rw-r--r--net/bluetooth/cmtp/capi.c8
-rw-r--r--net/bluetooth/cmtp/core.c3
-rw-r--r--net/bluetooth/hci_conn.c140
-rw-r--r--net/bluetooth/hci_core.c677
-rw-r--r--net/bluetooth/hci_event.c14
-rw-r--r--net/bluetooth/hci_request.c1856
-rw-r--r--net/bluetooth/hci_request.h55
-rw-r--r--net/bluetooth/hci_sock.c209
-rw-r--r--net/bluetooth/l2cap_core.c53
-rw-r--r--net/bluetooth/l2cap_sock.c12
-rw-r--r--net/bluetooth/leds.c74
-rw-r--r--net/bluetooth/leds.h16
-rw-r--r--net/bluetooth/mgmt.c1923
-rw-r--r--net/bluetooth/rfcomm/core.c46
-rw-r--r--net/bluetooth/sco.c3
-rw-r--r--net/bluetooth/smp.c158
-rw-r--r--net/bridge/br.c3
-rw-r--r--net/bridge/br_device.c8
-rw-r--r--net/bridge/br_fdb.c16
-rw-r--r--net/bridge/br_forward.c1
-rw-r--r--net/bridge/br_if.c72
-rw-r--r--net/bridge/br_input.c16
-rw-r--r--net/bridge/br_mdb.c145
-rw-r--r--net/bridge/br_multicast.c101
-rw-r--r--net/bridge/br_netfilter_hooks.c68
-rw-r--r--net/bridge/br_netlink.c1
-rw-r--r--net/bridge/br_private.h12
-rw-r--r--net/bridge/br_stp.c44
-rw-r--r--net/bridge/br_stp_if.c12
-rw-r--r--net/bridge/br_stp_timer.c1
-rw-r--r--net/bridge/br_sysfs_br.c3
-rw-r--r--net/bridge/br_vlan.c42
-rw-r--r--net/bridge/netfilter/ebt_ip6.c4
-rw-r--r--net/bridge/netfilter/ebt_log.c9
-rw-r--r--net/bridge/netfilter/ebt_stp.c2
-rw-r--r--net/bridge/netfilter/ebt_vlan.c15
-rw-r--r--net/bridge/netfilter/ebtable_filter.c2
-rw-r--r--net/bridge/netfilter/ebtable_nat.c2
-rw-r--r--net/bridge/netfilter/ebtables.c143
-rw-r--r--net/bridge/netfilter/nf_tables_bridge.c2
-rw-r--r--net/bridge/netfilter/nft_meta_bridge.c1
-rw-r--r--net/bridge/netfilter/nft_reject_bridge.c24
-rw-r--r--net/caif/caif_socket.c4
-rw-r--r--net/caif/cfpkt_skbuff.c2
-rw-r--r--net/caif/cfrfml.c2
-rw-r--r--net/ceph/auth_x.c85
-rw-r--r--net/ceph/auth_x.h2
-rw-r--r--net/ceph/ceph_common.c22
-rw-r--r--net/ceph/crush/mapper.c33
-rw-r--r--net/ceph/crypto.c101
-rw-r--r--net/ceph/crypto.h4
-rw-r--r--net/ceph/debugfs.c17
-rw-r--r--net/ceph/messenger.c235
-rw-r--r--net/ceph/mon_client.c461
-rw-r--r--net/ceph/osd_client.c157
-rw-r--r--net/ceph/osdmap.c19
-rw-r--r--net/ceph/pagelist.c4
-rw-r--r--net/ceph/pagevec.c32
-rw-r--r--net/core/Makefile5
-rw-r--r--net/core/datagram.c79
-rw-r--r--net/core/dev.c497
-rw-r--r--net/core/devlink.c738
-rw-r--r--net/core/dst.c13
-rw-r--r--net/core/dst_cache.c168
-rw-r--r--net/core/ethtool.c723
-rw-r--r--net/core/filter.c567
-rw-r--r--net/core/flow_dissector.c74
-rw-r--r--net/core/gen_estimator.c2
-rw-r--r--net/core/gen_stats.c1
-rw-r--r--net/core/hwbm.c87
-rw-r--r--net/core/lwtunnel.c37
-rw-r--r--net/core/neighbour.c6
-rw-r--r--net/core/net-sysfs.c23
-rw-r--r--net/core/net-traces.c4
-rw-r--r--net/core/netclassid_cgroup.c40
-rw-r--r--net/core/netprio_cgroup.c29
-rw-r--r--net/core/pktgen.c12
-rw-r--r--net/core/rtnetlink.c421
-rw-r--r--net/core/scm.c13
-rw-r--r--net/core/skbuff.c203
-rw-r--r--net/core/sock.c173
-rw-r--r--net/core/sock_diag.c23
-rw-r--r--net/core/sock_reuseport.c258
-rw-r--r--net/core/stream.c8
-rw-r--r--net/core/sysctl_net_core.c10
-rw-r--r--net/dccp/ipv4.c18
-rw-r--r--net/dccp/ipv6.c55
-rw-r--r--net/dccp/output.c2
-rw-r--r--net/dccp/proto.c3
-rw-r--r--net/decnet/af_decnet.c11
-rw-r--r--net/dns_resolver/dns_query.c2
-rw-r--r--net/dsa/dsa.c99
-rw-r--r--net/dsa/dsa_priv.h1
-rw-r--r--net/dsa/slave.c248
-rw-r--r--net/ethernet/eth.c34
-rw-r--r--net/hsr/hsr_device.c2
-rw-r--r--net/ieee802154/6lowpan/core.c13
-rw-r--r--net/ieee802154/6lowpan/reassembly.c1
-rw-r--r--net/ieee802154/socket.c17
-rw-r--r--net/ipv4/Kconfig23
-rw-r--r--net/ipv4/Makefile2
-rw-r--r--net/ipv4/af_inet.c72
-rw-r--r--net/ipv4/arp.c41
-rw-r--r--net/ipv4/devinet.c72
-rw-r--r--net/ipv4/fib_frontend.c29
-rw-r--r--net/ipv4/fib_semantics.c13
-rw-r--r--net/ipv4/fib_trie.c11
-rw-r--r--net/ipv4/fou.c57
-rw-r--r--net/ipv4/gre_offload.c117
-rw-r--r--net/ipv4/icmp.c5
-rw-r--r--net/ipv4/igmp.c98
-rw-r--r--net/ipv4/inet_connection_sock.c272
-rw-r--r--net/ipv4/inet_diag.c89
-rw-r--r--net/ipv4/inet_fragment.c10
-rw-r--r--net/ipv4/inet_hashtables.c237
-rw-r--r--net/ipv4/inet_lro.c374
-rw-r--r--net/ipv4/ip_forward.c1
-rw-r--r--net/ipv4/ip_fragment.c31
-rw-r--r--net/ipv4/ip_gre.c70
-rw-r--r--net/ipv4/ip_input.c33
-rw-r--r--net/ipv4/ip_options.c14
-rw-r--r--net/ipv4/ip_output.c14
-rw-r--r--net/ipv4/ip_sockglue.c57
-rw-r--r--net/ipv4/ip_tunnel.c109
-rw-r--r--net/ipv4/ip_tunnel_core.c69
-rw-r--r--net/ipv4/ip_vti.c3
-rw-r--r--net/ipv4/ipconfig.c66
-rw-r--r--net/ipv4/ipip.c8
-rw-r--r--net/ipv4/ipmr.c764
-rw-r--r--net/ipv4/netfilter/Kconfig1
-rw-r--r--net/ipv4/netfilter/arp_tables.c115
-rw-r--r--net/ipv4/netfilter/arptable_filter.c40
-rw-r--r--net/ipv4/netfilter/ip_tables.c111
-rw-r--r--net/ipv4/netfilter/ipt_SYNPROXY.c55
-rw-r--r--net/ipv4/netfilter/iptable_filter.c44
-rw-r--r--net/ipv4/netfilter/iptable_mangle.c41
-rw-r--r--net/ipv4/netfilter/iptable_nat.c41
-rw-r--r--net/ipv4/netfilter/iptable_raw.c38
-rw-r--r--net/ipv4/netfilter/iptable_security.c44
-rw-r--r--net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nf_defrag_ipv4.c11
-rw-r--r--net/ipv4/netfilter/nf_nat_l3proto_ipv4.c29
-rw-r--r--net/ipv4/netfilter/nf_nat_masquerade_ipv4.c12
-rw-r--r--net/ipv4/netfilter/nf_nat_pptp.c2
-rw-r--r--net/ipv4/netfilter/nf_nat_snmp_basic.c22
-rw-r--r--net/ipv4/netfilter/nf_reject_ipv4.c1
-rw-r--r--net/ipv4/netfilter/nf_tables_arp.c2
-rw-r--r--net/ipv4/netfilter/nf_tables_ipv4.c2
-rw-r--r--net/ipv4/netfilter/nft_masq_ipv4.c7
-rw-r--r--net/ipv4/ping.c17
-rw-r--r--net/ipv4/proc.c2
-rw-r--r--net/ipv4/raw.c23
-rw-r--r--net/ipv4/route.c77
-rw-r--r--net/ipv4/syncookies.c11
-rw-r--r--net/ipv4/sysctl_net_ipv4.c294
-rw-r--r--net/ipv4/tcp.c181
-rw-r--r--net/ipv4/tcp_diag.c21
-rw-r--r--net/ipv4/tcp_fastopen.c79
-rw-r--r--net/ipv4/tcp_input.c221
-rw-r--r--net/ipv4/tcp_ipv4.c153
-rw-r--r--net/ipv4/tcp_memcontrol.c233
-rw-r--r--net/ipv4/tcp_metrics.c5
-rw-r--r--net/ipv4/tcp_minisocks.c16
-rw-r--r--net/ipv4/tcp_offload.c8
-rw-r--r--net/ipv4/tcp_output.c54
-rw-r--r--net/ipv4/tcp_probe.c8
-rw-r--r--net/ipv4/tcp_timer.c38
-rw-r--r--net/ipv4/tcp_yeah.c2
-rw-r--r--net/ipv4/udp.c202
-rw-r--r--net/ipv4/udp_diag.c4
-rw-r--r--net/ipv4/udp_offload.c121
-rw-r--r--net/ipv4/udp_tunnel.c13
-rw-r--r--net/ipv4/xfrm4_policy.c46
-rw-r--r--net/ipv6/Kconfig3
-rw-r--r--net/ipv6/Makefile2
-rw-r--r--net/ipv6/addrconf.c312
-rw-r--r--net/ipv6/addrlabel.c2
-rw-r--r--net/ipv6/af_inet6.c24
-rw-r--r--net/ipv6/datagram.c7
-rw-r--r--net/ipv6/exthdrs.c3
-rw-r--r--net/ipv6/exthdrs_core.c6
-rw-r--r--net/ipv6/icmp.c14
-rw-r--r--net/ipv6/ila/Makefile7
-rw-r--r--net/ipv6/ila/ila.h48
-rw-r--r--net/ipv6/ila/ila_common.c104
-rw-r--r--net/ipv6/ila/ila_lwt.c (renamed from net/ipv6/ila.c)83
-rw-r--r--net/ipv6/ila/ila_xlat.c680
-rw-r--r--net/ipv6/inet6_connection_sock.c27
-rw-r--r--net/ipv6/inet6_hashtables.c78
-rw-r--r--net/ipv6/ip6_checksum.c26
-rw-r--r--net/ipv6/ip6_fib.c91
-rw-r--r--net/ipv6/ip6_flowlabel.c5
-rw-r--r--net/ipv6/ip6_gre.c24
-rw-r--r--net/ipv6/ip6_input.c12
-rw-r--r--net/ipv6/ip6_offload.c15
-rw-r--r--net/ipv6/ip6_output.c19
-rw-r--r--net/ipv6/ip6_tunnel.c107
-rw-r--r--net/ipv6/ip6_udp_tunnel.c6
-rw-r--r--net/ipv6/ip6_vti.c2
-rw-r--r--net/ipv6/ip6mr.c19
-rw-r--r--net/ipv6/ipv6_sockglue.c33
-rw-r--r--net/ipv6/mcast.c5
-rw-r--r--net/ipv6/ndisc.c23
-rw-r--r--net/ipv6/netfilter/Kconfig1
-rw-r--r--net/ipv6/netfilter/ip6_tables.c113
-rw-r--r--net/ipv6/netfilter/ip6table_filter.c47
-rw-r--r--net/ipv6/netfilter/ip6table_mangle.c46
-rw-r--r--net/ipv6/netfilter/ip6table_nat.c41
-rw-r--r--net/ipv6/netfilter/ip6table_raw.c46
-rw-r--r--net/ipv6/netfilter/ip6table_security.c44
-rw-r--r--net/ipv6/netfilter/nf_conntrack_reasm.c172
-rw-r--r--net/ipv6/netfilter/nf_defrag_ipv6_hooks.c20
-rw-r--r--net/ipv6/netfilter/nf_nat_l3proto_ipv6.c29
-rw-r--r--net/ipv6/netfilter/nf_nat_masquerade_ipv6.c74
-rw-r--r--net/ipv6/netfilter/nf_reject_ipv6.c1
-rw-r--r--net/ipv6/netfilter/nf_tables_ipv6.c2
-rw-r--r--net/ipv6/netfilter/nft_masq_ipv6.c7
-rw-r--r--net/ipv6/ping.c59
-rw-r--r--net/ipv6/raw.c28
-rw-r--r--net/ipv6/reassembly.c17
-rw-r--r--net/ipv6/route.c51
-rw-r--r--net/ipv6/sit.c40
-rw-r--r--net/ipv6/syncookies.c11
-rw-r--r--net/ipv6/tcp_ipv6.c149
-rw-r--r--net/ipv6/udp.c118
-rw-r--r--net/ipv6/udp_offload.c8
-rw-r--r--net/ipv6/xfrm6_mode_tunnel.c2
-rw-r--r--net/ipv6/xfrm6_policy.c53
-rw-r--r--net/irda/af_irda.c3
-rw-r--r--net/irda/ircomm/ircomm_param.c3
-rw-r--r--net/irda/ircomm/ircomm_tty.c15
-rw-r--r--net/irda/ircomm/ircomm_tty_ioctl.c13
-rw-r--r--net/iucv/af_iucv.c29
-rw-r--r--net/kcm/Kconfig10
-rw-r--r--net/kcm/Makefile3
-rw-r--r--net/kcm/kcmproc.c426
-rw-r--r--net/kcm/kcmsock.c2409
-rw-r--r--net/l2tp/l2tp_ip.c8
-rw-r--r--net/l2tp/l2tp_ip6.c19
-rw-r--r--net/l2tp/l2tp_netlink.c18
-rw-r--r--net/l2tp/l2tp_ppp.c19
-rw-r--r--net/l3mdev/l3mdev.c11
-rw-r--r--net/llc/af_llc.c4
-rw-r--r--net/mac80211/agg-rx.c52
-rw-r--r--net/mac80211/agg-tx.c56
-rw-r--r--net/mac80211/cfg.c565
-rw-r--r--net/mac80211/chan.c6
-rw-r--r--net/mac80211/debugfs.c9
-rw-r--r--net/mac80211/debugfs_key.c5
-rw-r--r--net/mac80211/driver-ops.c10
-rw-r--r--net/mac80211/driver-ops.h4
-rw-r--r--net/mac80211/ht.c5
-rw-r--r--net/mac80211/ibss.c34
-rw-r--r--net/mac80211/ieee80211_i.h72
-rw-r--r--net/mac80211/iface.c19
-rw-r--r--net/mac80211/key.c142
-rw-r--r--net/mac80211/key.h10
-rw-r--r--net/mac80211/main.c7
-rw-r--r--net/mac80211/mesh.c20
-rw-r--r--net/mac80211/mesh.h7
-rw-r--r--net/mac80211/mesh_hwmp.c8
-rw-r--r--net/mac80211/mesh_pathtbl.c123
-rw-r--r--net/mac80211/mesh_plink.c10
-rw-r--r--net/mac80211/mlme.c104
-rw-r--r--net/mac80211/offchannel.c831
-rw-r--r--net/mac80211/rc80211_minstrel.c2
-rw-r--r--net/mac80211/rc80211_minstrel_ht.c16
-rw-r--r--net/mac80211/rx.c184
-rw-r--r--net/mac80211/scan.c29
-rw-r--r--net/mac80211/sta_info.c212
-rw-r--r--net/mac80211/sta_info.h27
-rw-r--r--net/mac80211/status.c7
-rw-r--r--net/mac80211/tdls.c43
-rw-r--r--net/mac80211/tkip.c36
-rw-r--r--net/mac80211/tkip.h2
-rw-r--r--net/mac80211/trace.h68
-rw-r--r--net/mac80211/tx.c115
-rw-r--r--net/mac80211/util.c252
-rw-r--r--net/mac80211/vht.c97
-rw-r--r--net/mac80211/wpa.c11
-rw-r--r--net/mac802154/driver-ops.h3
-rw-r--r--net/mac802154/llsec.c41
-rw-r--r--net/mac802154/llsec.h3
-rw-r--r--net/mac802154/mac_cmd.c2
-rw-r--r--net/mac802154/main.c2
-rw-r--r--net/mac802154/rx.c3
-rw-r--r--net/mac802154/tx.c9
-rw-r--r--net/mpls/af_mpls.c231
-rw-r--r--net/mpls/internal.h2
-rw-r--r--net/mpls/mpls_iptunnel.c7
-rw-r--r--net/netfilter/Kconfig28
-rw-r--r--net/netfilter/Makefile9
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_gen.h19
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ip.c14
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_ipmac.c66
-rw-r--r--net/netfilter/ipset/ip_set_bitmap_port.c18
-rw-r--r--net/netfilter/ipset/ip_set_core.c158
-rw-r--r--net/netfilter/ipset/ip_set_hash_gen.h28
-rw-r--r--net/netfilter/ipset/ip_set_hash_mac.c3
-rw-r--r--net/netfilter/ipset/ip_set_hash_netiface.c4
-rw-r--r--net/netfilter/ipset/ip_set_list_set.c62
-rw-r--r--net/netfilter/ipvs/ip_vs_app.c8
-rw-r--r--net/netfilter/ipvs/ip_vs_core.c54
-rw-r--r--net/netfilter/ipvs/ip_vs_ctl.c17
-rw-r--r--net/netfilter/ipvs/ip_vs_pe_sip.c6
-rw-r--r--net/netfilter/ipvs/ip_vs_proto_sctp.c2
-rw-r--r--net/netfilter/ipvs/ip_vs_xmit.c12
-rw-r--r--net/netfilter/nf_conntrack_core.c41
-rw-r--r--net/netfilter/nf_conntrack_expect.c7
-rw-r--r--net/netfilter/nf_conntrack_ftp.c17
-rw-r--r--net/netfilter/nf_conntrack_helper.c2
-rw-r--r--net/netfilter/nf_conntrack_irc.c7
-rw-r--r--net/netfilter/nf_conntrack_netlink.c98
-rw-r--r--net/netfilter/nf_conntrack_sane.c19
-rw-r--r--net/netfilter/nf_conntrack_sip.c5
-rw-r--r--net/netfilter/nf_conntrack_standalone.c7
-rw-r--r--net/netfilter/nf_conntrack_tftp.c7
-rw-r--r--net/netfilter/nf_conntrack_timeout.c2
-rw-r--r--net/netfilter/nf_dup_netdev.c39
-rw-r--r--net/netfilter/nf_nat_redirect.c2
-rw-r--r--net/netfilter/nf_tables_api.c263
-rw-r--r--net/netfilter/nf_tables_core.c62
-rw-r--r--net/netfilter/nf_tables_inet.c2
-rw-r--r--net/netfilter/nf_tables_netdev.c57
-rw-r--r--net/netfilter/nf_tables_trace.c275
-rw-r--r--net/netfilter/nfnetlink.c55
-rw-r--r--net/netfilter/nfnetlink_acct.c24
-rw-r--r--net/netfilter/nfnetlink_cthelper.c18
-rw-r--r--net/netfilter/nfnetlink_cttimeout.c122
-rw-r--r--net/netfilter/nfnetlink_log.c47
-rw-r--r--net/netfilter/nfnetlink_queue.c145
-rw-r--r--net/netfilter/nft_byteorder.c21
-rw-r--r--net/netfilter/nft_compat.c12
-rw-r--r--net/netfilter/nft_counter.c51
-rw-r--r--net/netfilter/nft_ct.c39
-rw-r--r--net/netfilter/nft_dup_netdev.c97
-rw-r--r--net/netfilter/nft_dynset.c5
-rw-r--r--net/netfilter/nft_fwd_netdev.c98
-rw-r--r--net/netfilter/nft_limit.c16
-rw-r--r--net/netfilter/nft_masq.c51
-rw-r--r--net/netfilter/nft_meta.c101
-rw-r--r--net/netfilter/nft_payload.c135
-rw-r--r--net/netfilter/x_tables.c80
-rw-r--r--net/netfilter/xt_CT.c2
-rw-r--r--net/netfilter/xt_TCPMSS.c9
-rw-r--r--net/netfilter/xt_TEE.c10
-rw-r--r--net/netfilter/xt_TPROXY.c31
-rw-r--r--net/netfilter/xt_cgroup.c108
-rw-r--r--net/netfilter/xt_osf.c9
-rw-r--r--net/netfilter/xt_owner.c6
-rw-r--r--net/netfilter/xt_socket.c28
-rw-r--r--net/netlabel/netlabel_domainhash.c4
-rw-r--r--net/netlabel/netlabel_unlabeled.c6
-rw-r--r--net/netlink/Kconfig9
-rw-r--r--net/netlink/af_netlink.c787
-rw-r--r--net/netlink/af_netlink.h15
-rw-r--r--net/netlink/diag.c39
-rw-r--r--net/netlink/genetlink.c43
-rw-r--r--net/nfc/core.c13
-rw-r--r--net/nfc/digital_core.c3
-rw-r--r--net/nfc/llcp_commands.c4
-rw-r--r--net/nfc/llcp_sock.c8
-rw-r--r--net/nfc/nci/core.c6
-rw-r--r--net/nfc/nci/hci.c2
-rw-r--r--net/nfc/nci/uart.c9
-rw-r--r--net/nfc/netlink.c37
-rw-r--r--net/nfc/nfc.h1
-rw-r--r--net/openvswitch/Kconfig6
-rw-r--r--net/openvswitch/actions.c27
-rw-r--r--net/openvswitch/conntrack.c705
-rw-r--r--net/openvswitch/conntrack.h3
-rw-r--r--net/openvswitch/datapath.c113
-rw-r--r--net/openvswitch/datapath.h4
-rw-r--r--net/openvswitch/dp_notify.c2
-rw-r--r--net/openvswitch/flow.h2
-rw-r--r--net/openvswitch/flow_netlink.c14
-rw-r--r--net/openvswitch/vport-geneve.c7
-rw-r--r--net/openvswitch/vport-gre.c1
-rw-r--r--net/openvswitch/vport-internal_dev.c10
-rw-r--r--net/openvswitch/vport-netdev.c12
-rw-r--r--net/openvswitch/vport-vxlan.c4
-rw-r--r--net/openvswitch/vport.c8
-rw-r--r--net/openvswitch/vport.h38
-rw-r--r--net/packet/af_packet.c632
-rw-r--r--net/phonet/af_phonet.c4
-rw-r--r--net/phonet/socket.c6
-rw-r--r--net/rds/Kconfig7
-rw-r--r--net/rds/Makefile4
-rw-r--r--net/rds/af_rds.c26
-rw-r--r--net/rds/connection.c6
-rw-r--r--net/rds/ib.c79
-rw-r--r--net/rds/ib.h43
-rw-r--r--net/rds/ib_cm.c61
-rw-r--r--net/rds/ib_fmr.c248
-rw-r--r--net/rds/ib_frmr.c376
-rw-r--r--net/rds/ib_mr.h148
-rw-r--r--net/rds/ib_rdma.c495
-rw-r--r--net/rds/ib_recv.c6
-rw-r--r--net/rds/ib_send.c77
-rw-r--r--net/rds/ib_stats.c2
-rw-r--r--net/rds/iw.c325
-rw-r--r--net/rds/iw.h395
-rw-r--r--net/rds/iw_cm.c769
-rw-r--r--net/rds/iw_rdma.c874
-rw-r--r--net/rds/iw_recv.c904
-rw-r--r--net/rds/iw_ring.c169
-rw-r--r--net/rds/iw_send.c979
-rw-r--r--net/rds/iw_stats.c95
-rw-r--r--net/rds/iw_sysctl.c123
-rw-r--r--net/rds/page.c39
-rw-r--r--net/rds/rdma_transport.c25
-rw-r--r--net/rds/rdma_transport.h5
-rw-r--r--net/rds/rds.h1
-rw-r--r--net/rds/recv.c20
-rw-r--r--net/rds/send.c4
-rw-r--r--net/rds/tcp.c146
-rw-r--r--net/rfkill/Kconfig3
-rw-r--r--net/rfkill/core.c194
-rw-r--r--net/rfkill/rfkill-gpio.c28
-rw-r--r--net/rxrpc/af_rxrpc.c41
-rw-r--r--net/rxrpc/ar-accept.c56
-rw-r--r--net/rxrpc/ar-ack.c229
-rw-r--r--net/rxrpc/ar-call.c88
-rw-r--r--net/rxrpc/ar-connection.c87
-rw-r--r--net/rxrpc/ar-connevent.c79
-rw-r--r--net/rxrpc/ar-error.c13
-rw-r--r--net/rxrpc/ar-input.c118
-rw-r--r--net/rxrpc/ar-internal.h220
-rw-r--r--net/rxrpc/ar-key.c36
-rw-r--r--net/rxrpc/ar-local.c29
-rw-r--r--net/rxrpc/ar-output.c77
-rw-r--r--net/rxrpc/ar-peer.c2
-rw-r--r--net/rxrpc/ar-proc.c10
-rw-r--r--net/rxrpc/ar-recvmsg.c20
-rw-r--r--net/rxrpc/ar-security.c6
-rw-r--r--net/rxrpc/ar-skbuff.c7
-rw-r--r--net/rxrpc/ar-transport.c3
-rw-r--r--net/rxrpc/rxkad.c337
-rw-r--r--net/rxrpc/sysctl.c34
-rw-r--r--net/sched/Kconfig36
-rw-r--r--net/sched/Makefile3
-rw-r--r--net/sched/act_api.c137
-rw-r--r--net/sched/act_bpf.c52
-rw-r--r--net/sched/act_connmark.c54
-rw-r--r--net/sched/act_csum.c67
-rw-r--r--net/sched/act_gact.c55
-rw-r--r--net/sched/act_ife.c870
-rw-r--r--net/sched/act_ipt.c129
-rw-r--r--net/sched/act_meta_mark.c79
-rw-r--r--net/sched/act_meta_skbprio.c76
-rw-r--r--net/sched/act_mirred.c55
-rw-r--r--net/sched/act_nat.c72
-rw-r--r--net/sched/act_pedit.c54
-rw-r--r--net/sched/act_police.c52
-rw-r--r--net/sched/act_simple.c55
-rw-r--r--net/sched/act_skbedit.c54
-rw-r--r--net/sched/act_vlan.c54
-rw-r--r--net/sched/cls_bpf.c21
-rw-r--r--net/sched/cls_flow.c15
-rw-r--r--net/sched/cls_flower.c74
-rw-r--r--net/sched/cls_u32.c118
-rw-r--r--net/sched/em_meta.c138
-rw-r--r--net/sched/sch_api.c36
-rw-r--r--net/sched/sch_cbq.c12
-rw-r--r--net/sched/sch_choke.c6
-rw-r--r--net/sched/sch_codel.c10
-rw-r--r--net/sched/sch_drr.c11
-rw-r--r--net/sched/sch_dsmark.c13
-rw-r--r--net/sched/sch_fq.c4
-rw-r--r--net/sched/sch_fq_codel.c17
-rw-r--r--net/sched/sch_generic.c7
-rw-r--r--net/sched/sch_hfsc.c9
-rw-r--r--net/sched/sch_hhf.c10
-rw-r--r--net/sched/sch_htb.c24
-rw-r--r--net/sched/sch_ingress.c88
-rw-r--r--net/sched/sch_mq.c6
-rw-r--r--net/sched/sch_mqprio.c15
-rw-r--r--net/sched/sch_multiq.c16
-rw-r--r--net/sched/sch_netem.c13
-rw-r--r--net/sched/sch_pie.c5
-rw-r--r--net/sched/sch_prio.c15
-rw-r--r--net/sched/sch_qfq.c9
-rw-r--r--net/sched/sch_red.c10
-rw-r--r--net/sched/sch_sfb.c10
-rw-r--r--net/sched/sch_sfq.c16
-rw-r--r--net/sched/sch_tbf.c15
-rw-r--r--net/sctp/associola.c14
-rw-r--r--net/sctp/auth.c40
-rw-r--r--net/sctp/bind_addr.c14
-rw-r--r--net/sctp/chunk.c19
-rw-r--r--net/sctp/endpointola.c53
-rw-r--r--net/sctp/input.c205
-rw-r--r--net/sctp/ipv6.c26
-rw-r--r--net/sctp/output.c17
-rw-r--r--net/sctp/outqueue.c38
-rw-r--r--net/sctp/probe.c10
-rw-r--r--net/sctp/proc.c328
-rw-r--r--net/sctp/protocol.c84
-rw-r--r--net/sctp/sm_make_chunk.c138
-rw-r--r--net/sctp/sm_sideeffect.c59
-rw-r--r--net/sctp/sm_statefuns.c26
-rw-r--r--net/sctp/socket.c107
-rw-r--r--net/sctp/sysctl.c9
-rw-r--r--net/sctp/transport.c16
-rw-r--r--net/socket.c91
-rw-r--r--net/sunrpc/Makefile3
-rw-r--r--net/sunrpc/auth_gss/auth_gss.c27
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_crypto.c356
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_keys.c12
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_mech.c90
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_seqnum.c22
-rw-r--r--net/sunrpc/auth_gss/gss_krb5_wrap.c28
-rw-r--r--net/sunrpc/auth_gss/gss_rpc_upcall.c3
-rw-r--r--net/sunrpc/auth_null.c4
-rw-r--r--net/sunrpc/auth_unix.c6
-rw-r--r--net/sunrpc/backchannel_rqst.c24
-rw-r--r--net/sunrpc/cache.c75
-rw-r--r--net/sunrpc/clnt.c329
-rw-r--r--net/sunrpc/rpc_pipe.c66
-rw-r--r--net/sunrpc/rpcb_clnt.c10
-rw-r--r--net/sunrpc/sched.c8
-rw-r--r--net/sunrpc/socklib.c6
-rw-r--r--net/sunrpc/svc.c18
-rw-r--r--net/sunrpc/svc_xprt.c45
-rw-r--r--net/sunrpc/svcauth.c2
-rw-r--r--net/sunrpc/svcauth_unix.c8
-rw-r--r--net/sunrpc/svcsock.c40
-rw-r--r--net/sunrpc/sysctl.c23
-rw-r--r--net/sunrpc/xdr.c50
-rw-r--r--net/sunrpc/xprt.c43
-rw-r--r--net/sunrpc/xprtmultipath.c475
-rw-r--r--net/sunrpc/xprtrdma/Makefile3
-rw-r--r--net/sunrpc/xprtrdma/backchannel.c394
-rw-r--r--net/sunrpc/xprtrdma/fmr_ops.c90
-rw-r--r--net/sunrpc/xprtrdma/frwr_ops.c362
-rw-r--r--net/sunrpc/xprtrdma/physical_ops.c14
-rw-r--r--net/sunrpc/xprtrdma/rpc_rdma.c256
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma.c47
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_backchannel.c364
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_marshal.c64
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_recvfrom.c209
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_sendto.c239
-rw-r--r--net/sunrpc/xprtrdma/svc_rdma_transport.c841
-rw-r--r--net/sunrpc/xprtrdma/transport.c51
-rw-r--r--net/sunrpc/xprtrdma/verbs.c604
-rw-r--r--net/sunrpc/xprtrdma/xprt_rdma.h98
-rw-r--r--net/sunrpc/xprtsock.c327
-rw-r--r--net/switchdev/switchdev.c36
-rw-r--r--net/tipc/bcast.c129
-rw-r--r--net/tipc/bcast.h2
-rw-r--r--net/tipc/bearer.c158
-rw-r--r--net/tipc/bearer.h8
-rw-r--r--net/tipc/core.h5
-rw-r--r--net/tipc/discover.c38
-rw-r--r--net/tipc/link.c755
-rw-r--r--net/tipc/link.h176
-rw-r--r--net/tipc/name_distr.c68
-rw-r--r--net/tipc/name_distr.h1
-rw-r--r--net/tipc/name_table.c25
-rw-r--r--net/tipc/net.c7
-rw-r--r--net/tipc/netlink.c77
-rw-r--r--net/tipc/netlink.h11
-rw-r--r--net/tipc/netlink_compat.c10
-rw-r--r--net/tipc/node.c943
-rw-r--r--net/tipc/node.h127
-rw-r--r--net/tipc/server.c4
-rw-r--r--net/tipc/socket.c56
-rw-r--r--net/tipc/subscr.c132
-rw-r--r--net/tipc/subscr.h11
-rw-r--r--net/tipc/udp_media.c68
-rw-r--r--net/unix/af_unix.c470
-rw-r--r--net/unix/diag.c2
-rw-r--r--net/unix/garbage.c17
-rw-r--r--net/vmw_vsock/af_vsock.c158
-rw-r--r--net/vmw_vsock/vmci_transport.c6
-rw-r--r--net/vmw_vsock/vmci_transport.h2
-rw-r--r--net/vmw_vsock/vmci_transport_notify.c2
-rw-r--r--net/vmw_vsock/vmci_transport_notify.h5
-rw-r--r--net/vmw_vsock/vmci_transport_notify_qstate.c2
-rw-r--r--net/wireless/Kconfig25
-rw-r--r--net/wireless/core.c12
-rw-r--r--net/wireless/core.h7
-rw-r--r--net/wireless/lib80211_crypt_ccmp.c4
-rw-r--r--net/wireless/lib80211_crypt_tkip.c103
-rw-r--r--net/wireless/lib80211_crypt_wep.c46
-rw-r--r--net/wireless/mlme.c3
-rw-r--r--net/wireless/nl80211.c96
-rw-r--r--net/wireless/ocb.c3
-rw-r--r--net/wireless/radiotap.c1
-rw-r--r--net/wireless/rdev-ops.h51
-rw-r--r--net/wireless/reg.c252
-rw-r--r--net/wireless/sme.c15
-rw-r--r--net/wireless/trace.h103
-rw-r--r--net/wireless/util.c398
-rw-r--r--net/wireless/wext-core.c52
-rw-r--r--net/xfrm/xfrm_algo.c7
-rw-r--r--net/xfrm/xfrm_input.c3
-rw-r--r--net/xfrm/xfrm_output.c2
-rw-r--r--net/xfrm/xfrm_policy.c88
-rw-r--r--net/xfrm/xfrm_user.c2
682 files changed, 37940 insertions, 23141 deletions
diff --git a/net/6lowpan/6lowpan_i.h b/net/6lowpan/6lowpan_i.h
new file mode 100644
index 000000000000..d16bb4b14aa1
--- /dev/null
+++ b/net/6lowpan/6lowpan_i.h
@@ -0,0 +1,28 @@
+#ifndef __6LOWPAN_I_H
+#define __6LOWPAN_I_H
+
+#include <linux/netdevice.h>
+
+#ifdef CONFIG_6LOWPAN_DEBUGFS
+int lowpan_dev_debugfs_init(struct net_device *dev);
+void lowpan_dev_debugfs_exit(struct net_device *dev);
+
+int __init lowpan_debugfs_init(void);
+void lowpan_debugfs_exit(void);
+#else
+static inline int lowpan_dev_debugfs_init(struct net_device *dev)
+{
+ return 0;
+}
+
+static inline void lowpan_dev_debugfs_exit(struct net_device *dev) { }
+
+static inline int __init lowpan_debugfs_init(void)
+{
+ return 0;
+}
+
+static inline void lowpan_debugfs_exit(void) { }
+#endif /* CONFIG_6LOWPAN_DEBUGFS */
+
+#endif /* __6LOWPAN_I_H */
diff --git a/net/6lowpan/Kconfig b/net/6lowpan/Kconfig
index 7fa0f382e7d1..9c051512d14f 100644
--- a/net/6lowpan/Kconfig
+++ b/net/6lowpan/Kconfig
@@ -5,12 +5,21 @@ menuconfig 6LOWPAN
This enables IPv6 over Low power Wireless Personal Area Network -
"6LoWPAN" which is supported by IEEE 802.15.4 or Bluetooth stacks.
+config 6LOWPAN_DEBUGFS
+ bool "6LoWPAN debugfs support"
+ depends on 6LOWPAN
+ depends on DEBUG_FS
+ ---help---
+ This enables 6LoWPAN debugfs support. For example to manipulate
+ IPHC context information at runtime.
+
menuconfig 6LOWPAN_NHC
- tristate "Next Header Compression Support"
+ tristate "Next Header and Generic Header Compression Support"
depends on 6LOWPAN
default y
---help---
- Support for next header compression.
+ Support for next header and generic header compression defined in
+ RFC6282 and RFC7400.
if 6LOWPAN_NHC
@@ -58,4 +67,38 @@ config 6LOWPAN_NHC_UDP
---help---
6LoWPAN IPv6 UDP Header compression according to RFC6282.
+config 6LOWPAN_GHC_EXT_HDR_HOP
+ tristate "GHC Hop-by-Hop Options Header Support"
+ ---help---
+ 6LoWPAN IPv6 Hop-by-Hop option generic header compression according
+ to RFC7400.
+
+config 6LOWPAN_GHC_UDP
+ tristate "GHC UDP Support"
+ ---help---
+ 6LoWPAN IPv6 UDP generic header compression according to RFC7400.
+
+config 6LOWPAN_GHC_ICMPV6
+ tristate "GHC ICMPv6 Support"
+ ---help---
+ 6LoWPAN IPv6 ICMPv6 generic header compression according to RFC7400.
+
+config 6LOWPAN_GHC_EXT_HDR_DEST
+ tristate "GHC Destination Options Header Support"
+ ---help---
+ 6LoWPAN IPv6 destination option generic header compression according
+ to RFC7400.
+
+config 6LOWPAN_GHC_EXT_HDR_FRAG
+ tristate "GHC Fragmentation Options Header Support"
+ ---help---
+ 6LoWPAN IPv6 fragmentation option generic header compression
+ according to RFC7400.
+
+config 6LOWPAN_GHC_EXT_HDR_ROUTE
+ tristate "GHC Routing Options Header Support"
+ ---help---
+ 6LoWPAN IPv6 routing option generic header compression according
+ to RFC7400.
+
endif
diff --git a/net/6lowpan/Makefile b/net/6lowpan/Makefile
index c6ffc55ee0d7..e44f3bf2dd42 100644
--- a/net/6lowpan/Makefile
+++ b/net/6lowpan/Makefile
@@ -1,6 +1,7 @@
obj-$(CONFIG_6LOWPAN) += 6lowpan.o
6lowpan-y := core.o iphc.o nhc.o
+6lowpan-$(CONFIG_6LOWPAN_DEBUGFS) += debugfs.o
#rfc6282 nhcs
obj-$(CONFIG_6LOWPAN_NHC_DEST) += nhc_dest.o
@@ -10,3 +11,11 @@ obj-$(CONFIG_6LOWPAN_NHC_IPV6) += nhc_ipv6.o
obj-$(CONFIG_6LOWPAN_NHC_MOBILITY) += nhc_mobility.o
obj-$(CONFIG_6LOWPAN_NHC_ROUTING) += nhc_routing.o
obj-$(CONFIG_6LOWPAN_NHC_UDP) += nhc_udp.o
+
+#rfc7400 ghcs
+obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_HOP) += nhc_ghc_ext_hop.o
+obj-$(CONFIG_6LOWPAN_GHC_UDP) += nhc_ghc_udp.o
+obj-$(CONFIG_6LOWPAN_GHC_ICMPV6) += nhc_ghc_icmpv6.o
+obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_DEST) += nhc_ghc_ext_dest.o
+obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG) += nhc_ghc_ext_frag.o
+obj-$(CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE) += nhc_ghc_ext_route.o
diff --git a/net/6lowpan/core.c b/net/6lowpan/core.c
index 83b19e072224..34e44c0c0836 100644
--- a/net/6lowpan/core.c
+++ b/net/6lowpan/core.c
@@ -15,19 +15,103 @@
#include <net/6lowpan.h>
-void lowpan_netdev_setup(struct net_device *dev, enum lowpan_lltypes lltype)
+#include "6lowpan_i.h"
+
+int lowpan_register_netdevice(struct net_device *dev,
+ enum lowpan_lltypes lltype)
{
+ int i, ret;
+
dev->addr_len = EUI64_ADDR_LEN;
dev->type = ARPHRD_6LOWPAN;
dev->mtu = IPV6_MIN_MTU;
dev->priv_flags |= IFF_NO_QUEUE;
lowpan_priv(dev)->lltype = lltype;
+
+ spin_lock_init(&lowpan_priv(dev)->ctx.lock);
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+ lowpan_priv(dev)->ctx.table[i].id = i;
+
+ ret = register_netdevice(dev);
+ if (ret < 0)
+ return ret;
+
+ ret = lowpan_dev_debugfs_init(dev);
+ if (ret < 0)
+ unregister_netdevice(dev);
+
+ return ret;
+}
+EXPORT_SYMBOL(lowpan_register_netdevice);
+
+int lowpan_register_netdev(struct net_device *dev,
+ enum lowpan_lltypes lltype)
+{
+ int ret;
+
+ rtnl_lock();
+ ret = lowpan_register_netdevice(dev, lltype);
+ rtnl_unlock();
+ return ret;
+}
+EXPORT_SYMBOL(lowpan_register_netdev);
+
+void lowpan_unregister_netdevice(struct net_device *dev)
+{
+ unregister_netdevice(dev);
+ lowpan_dev_debugfs_exit(dev);
+}
+EXPORT_SYMBOL(lowpan_unregister_netdevice);
+
+void lowpan_unregister_netdev(struct net_device *dev)
+{
+ rtnl_lock();
+ lowpan_unregister_netdevice(dev);
+ rtnl_unlock();
+}
+EXPORT_SYMBOL(lowpan_unregister_netdev);
+
+static int lowpan_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ int i;
+
+ if (dev->type != ARPHRD_6LOWPAN)
+ return NOTIFY_DONE;
+
+ switch (event) {
+ case NETDEV_DOWN:
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++)
+ clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE,
+ &lowpan_priv(dev)->ctx.table[i].flags);
+ break;
+ default:
+ return NOTIFY_DONE;
+ }
+
+ return NOTIFY_OK;
}
-EXPORT_SYMBOL(lowpan_netdev_setup);
+
+static struct notifier_block lowpan_notifier = {
+ .notifier_call = lowpan_event,
+};
static int __init lowpan_module_init(void)
{
+ int ret;
+
+ ret = lowpan_debugfs_init();
+ if (ret < 0)
+ return ret;
+
+ ret = register_netdevice_notifier(&lowpan_notifier);
+ if (ret < 0) {
+ lowpan_debugfs_exit();
+ return ret;
+ }
+
request_module_nowait("ipv6");
request_module_nowait("nhc_dest");
@@ -40,6 +124,14 @@ static int __init lowpan_module_init(void)
return 0;
}
+
+static void __exit lowpan_module_exit(void)
+{
+ lowpan_debugfs_exit();
+ unregister_netdevice_notifier(&lowpan_notifier);
+}
+
module_init(lowpan_module_init);
+module_exit(lowpan_module_exit);
MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/debugfs.c b/net/6lowpan/debugfs.c
new file mode 100644
index 000000000000..0793a8157472
--- /dev/null
+++ b/net/6lowpan/debugfs.c
@@ -0,0 +1,300 @@
+/* This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+ * GNU General Public License for more details.
+ *
+ * Authors:
+ * (C) 2015 Pengutronix, Alexander Aring <aar@pengutronix.de>
+ * Copyright (c) 2015 Nordic Semiconductor. All Rights Reserved.
+ */
+
+#include <net/6lowpan.h>
+
+#include "6lowpan_i.h"
+
+#define LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS 8
+
+static struct dentry *lowpan_debugfs;
+
+static int lowpan_ctx_flag_active_set(void *data, u64 val)
+{
+ struct lowpan_iphc_ctx *ctx = data;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ if (val)
+ set_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+ else
+ clear_bit(LOWPAN_IPHC_CTX_FLAG_ACTIVE, &ctx->flags);
+
+ return 0;
+}
+
+static int lowpan_ctx_flag_active_get(void *data, u64 *val)
+{
+ *val = lowpan_iphc_ctx_is_active(data);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_active_fops,
+ lowpan_ctx_flag_active_get,
+ lowpan_ctx_flag_active_set, "%llu\n");
+
+static int lowpan_ctx_flag_c_set(void *data, u64 val)
+{
+ struct lowpan_iphc_ctx *ctx = data;
+
+ if (val != 0 && val != 1)
+ return -EINVAL;
+
+ if (val)
+ set_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+ else
+ clear_bit(LOWPAN_IPHC_CTX_FLAG_COMPRESSION, &ctx->flags);
+
+ return 0;
+}
+
+static int lowpan_ctx_flag_c_get(void *data, u64 *val)
+{
+ *val = lowpan_iphc_ctx_is_compression(data);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_flag_c_fops, lowpan_ctx_flag_c_get,
+ lowpan_ctx_flag_c_set, "%llu\n");
+
+static int lowpan_ctx_plen_set(void *data, u64 val)
+{
+ struct lowpan_iphc_ctx *ctx = data;
+ struct lowpan_iphc_ctx_table *t =
+ container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+ if (val > 128)
+ return -EINVAL;
+
+ spin_lock_bh(&t->lock);
+ ctx->plen = val;
+ spin_unlock_bh(&t->lock);
+
+ return 0;
+}
+
+static int lowpan_ctx_plen_get(void *data, u64 *val)
+{
+ struct lowpan_iphc_ctx *ctx = data;
+ struct lowpan_iphc_ctx_table *t =
+ container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+ spin_lock_bh(&t->lock);
+ *val = ctx->plen;
+ spin_unlock_bh(&t->lock);
+ return 0;
+}
+
+DEFINE_SIMPLE_ATTRIBUTE(lowpan_ctx_plen_fops, lowpan_ctx_plen_get,
+ lowpan_ctx_plen_set, "%llu\n");
+
+static int lowpan_ctx_pfx_show(struct seq_file *file, void *offset)
+{
+ struct lowpan_iphc_ctx *ctx = file->private;
+ struct lowpan_iphc_ctx_table *t =
+ container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+
+ spin_lock_bh(&t->lock);
+ seq_printf(file, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x\n",
+ be16_to_cpu(ctx->pfx.s6_addr16[0]),
+ be16_to_cpu(ctx->pfx.s6_addr16[1]),
+ be16_to_cpu(ctx->pfx.s6_addr16[2]),
+ be16_to_cpu(ctx->pfx.s6_addr16[3]),
+ be16_to_cpu(ctx->pfx.s6_addr16[4]),
+ be16_to_cpu(ctx->pfx.s6_addr16[5]),
+ be16_to_cpu(ctx->pfx.s6_addr16[6]),
+ be16_to_cpu(ctx->pfx.s6_addr16[7]));
+ spin_unlock_bh(&t->lock);
+
+ return 0;
+}
+
+static int lowpan_ctx_pfx_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lowpan_ctx_pfx_show, inode->i_private);
+}
+
+static ssize_t lowpan_ctx_pfx_write(struct file *fp,
+ const char __user *user_buf, size_t count,
+ loff_t *ppos)
+{
+ char buf[128] = {};
+ struct seq_file *file = fp->private_data;
+ struct lowpan_iphc_ctx *ctx = file->private;
+ struct lowpan_iphc_ctx_table *t =
+ container_of(ctx, struct lowpan_iphc_ctx_table, table[ctx->id]);
+ int status = count, n, i;
+ unsigned int addr[8];
+
+ if (copy_from_user(&buf, user_buf, min_t(size_t, sizeof(buf) - 1,
+ count))) {
+ status = -EFAULT;
+ goto out;
+ }
+
+ n = sscanf(buf, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x",
+ &addr[0], &addr[1], &addr[2], &addr[3], &addr[4],
+ &addr[5], &addr[6], &addr[7]);
+ if (n != LOWPAN_DEBUGFS_CTX_PFX_NUM_ARGS) {
+ status = -EINVAL;
+ goto out;
+ }
+
+ spin_lock_bh(&t->lock);
+ for (i = 0; i < 8; i++)
+ ctx->pfx.s6_addr16[i] = cpu_to_be16(addr[i] & 0xffff);
+ spin_unlock_bh(&t->lock);
+
+out:
+ return status;
+}
+
+static const struct file_operations lowpan_ctx_pfx_fops = {
+ .open = lowpan_ctx_pfx_open,
+ .read = seq_read,
+ .write = lowpan_ctx_pfx_write,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+static int lowpan_dev_debugfs_ctx_init(struct net_device *dev,
+ struct dentry *ctx, u8 id)
+{
+ struct lowpan_priv *lpriv = lowpan_priv(dev);
+ struct dentry *dentry, *root;
+ char buf[32];
+
+ WARN_ON_ONCE(id > LOWPAN_IPHC_CTX_TABLE_SIZE);
+
+ sprintf(buf, "%d", id);
+
+ root = debugfs_create_dir(buf, ctx);
+ if (!root)
+ return -EINVAL;
+
+ dentry = debugfs_create_file("active", 0644, root,
+ &lpriv->ctx.table[id],
+ &lowpan_ctx_flag_active_fops);
+ if (!dentry)
+ return -EINVAL;
+
+ dentry = debugfs_create_file("compression", 0644, root,
+ &lpriv->ctx.table[id],
+ &lowpan_ctx_flag_c_fops);
+ if (!dentry)
+ return -EINVAL;
+
+ dentry = debugfs_create_file("prefix", 0644, root,
+ &lpriv->ctx.table[id],
+ &lowpan_ctx_pfx_fops);
+ if (!dentry)
+ return -EINVAL;
+
+ dentry = debugfs_create_file("prefix_len", 0644, root,
+ &lpriv->ctx.table[id],
+ &lowpan_ctx_plen_fops);
+ if (!dentry)
+ return -EINVAL;
+
+ return 0;
+}
+
+static int lowpan_context_show(struct seq_file *file, void *offset)
+{
+ struct lowpan_iphc_ctx_table *t = file->private;
+ int i;
+
+ seq_printf(file, "%3s|%-43s|%c\n", "cid", "prefix", 'C');
+ seq_puts(file, "-------------------------------------------------\n");
+
+ spin_lock_bh(&t->lock);
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+ if (!lowpan_iphc_ctx_is_active(&t->table[i]))
+ continue;
+
+ seq_printf(file, "%3d|%39pI6c/%-3d|%d\n", t->table[i].id,
+ &t->table[i].pfx, t->table[i].plen,
+ lowpan_iphc_ctx_is_compression(&t->table[i]));
+ }
+ spin_unlock_bh(&t->lock);
+
+ return 0;
+}
+
+static int lowpan_context_open(struct inode *inode, struct file *file)
+{
+ return single_open(file, lowpan_context_show, inode->i_private);
+}
+
+static const struct file_operations lowpan_context_fops = {
+ .open = lowpan_context_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release,
+};
+
+int lowpan_dev_debugfs_init(struct net_device *dev)
+{
+ struct lowpan_priv *lpriv = lowpan_priv(dev);
+ struct dentry *contexts, *dentry;
+ int ret, i;
+
+ /* creating the root */
+ lpriv->iface_debugfs = debugfs_create_dir(dev->name, lowpan_debugfs);
+ if (!lpriv->iface_debugfs)
+ goto fail;
+
+ contexts = debugfs_create_dir("contexts", lpriv->iface_debugfs);
+ if (!contexts)
+ goto remove_root;
+
+ dentry = debugfs_create_file("show", 0644, contexts,
+ &lowpan_priv(dev)->ctx,
+ &lowpan_context_fops);
+ if (!dentry)
+ goto remove_root;
+
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+ ret = lowpan_dev_debugfs_ctx_init(dev, contexts, i);
+ if (ret < 0)
+ goto remove_root;
+ }
+
+ return 0;
+
+remove_root:
+ lowpan_dev_debugfs_exit(dev);
+fail:
+ return -EINVAL;
+}
+
+void lowpan_dev_debugfs_exit(struct net_device *dev)
+{
+ debugfs_remove_recursive(lowpan_priv(dev)->iface_debugfs);
+}
+
+int __init lowpan_debugfs_init(void)
+{
+ lowpan_debugfs = debugfs_create_dir("6lowpan", NULL);
+ if (!lowpan_debugfs)
+ return -EINVAL;
+
+ return 0;
+}
+
+void lowpan_debugfs_exit(void)
+{
+ debugfs_remove_recursive(lowpan_debugfs);
+}
diff --git a/net/6lowpan/iphc.c b/net/6lowpan/iphc.c
index 346b5c1a9185..99bb22aea346 100644
--- a/net/6lowpan/iphc.c
+++ b/net/6lowpan/iphc.c
@@ -56,6 +56,7 @@
/* special link-layer handling */
#include <net/mac802154.h>
+#include "6lowpan_i.h"
#include "nhc.h"
/* Values of fields within the IPHC encoding first byte */
@@ -147,6 +148,9 @@
(((a)->s6_addr16[6]) == 0) && \
(((a)->s6_addr[14]) == 0))
+#define LOWPAN_IPHC_CID_DCI(cid) (cid & 0x0f)
+#define LOWPAN_IPHC_CID_SCI(cid) ((cid & 0xf0) >> 4)
+
static inline void iphc_uncompress_eui64_lladdr(struct in6_addr *ipaddr,
const void *lladdr)
{
@@ -195,6 +199,98 @@ static inline void iphc_uncompress_802154_lladdr(struct in6_addr *ipaddr,
}
}
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_id(const struct net_device *dev, u8 id)
+{
+ struct lowpan_iphc_ctx *ret = &lowpan_priv(dev)->ctx.table[id];
+
+ if (!lowpan_iphc_ctx_is_active(ret))
+ return NULL;
+
+ return ret;
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_addr(const struct net_device *dev,
+ const struct in6_addr *addr)
+{
+ struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table;
+ struct lowpan_iphc_ctx *ret = NULL;
+ struct in6_addr addr_pfx;
+ u8 addr_plen;
+ int i;
+
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+ /* Check if context is valid. A context that is not valid
+ * MUST NOT be used for compression.
+ */
+ if (!lowpan_iphc_ctx_is_active(&table[i]) ||
+ !lowpan_iphc_ctx_is_compression(&table[i]))
+ continue;
+
+ ipv6_addr_prefix(&addr_pfx, addr, table[i].plen);
+
+ /* if prefix len < 64, the remaining bits until 64th bit is
+ * zero. Otherwise we use table[i]->plen.
+ */
+ if (table[i].plen < 64)
+ addr_plen = 64;
+ else
+ addr_plen = table[i].plen;
+
+ if (ipv6_prefix_equal(&addr_pfx, &table[i].pfx, addr_plen)) {
+ /* remember first match */
+ if (!ret) {
+ ret = &table[i];
+ continue;
+ }
+
+ /* get the context with longest prefix len */
+ if (table[i].plen > ret->plen)
+ ret = &table[i];
+ }
+ }
+
+ return ret;
+}
+
+static struct lowpan_iphc_ctx *
+lowpan_iphc_ctx_get_by_mcast_addr(const struct net_device *dev,
+ const struct in6_addr *addr)
+{
+ struct lowpan_iphc_ctx *table = lowpan_priv(dev)->ctx.table;
+ struct lowpan_iphc_ctx *ret = NULL;
+ struct in6_addr addr_mcast, network_pfx = {};
+ int i;
+
+ /* init mcast address with */
+ memcpy(&addr_mcast, addr, sizeof(*addr));
+
+ for (i = 0; i < LOWPAN_IPHC_CTX_TABLE_SIZE; i++) {
+ /* Check if context is valid. A context that is not valid
+ * MUST NOT be used for compression.
+ */
+ if (!lowpan_iphc_ctx_is_active(&table[i]) ||
+ !lowpan_iphc_ctx_is_compression(&table[i]))
+ continue;
+
+ /* setting plen */
+ addr_mcast.s6_addr[3] = table[i].plen;
+ /* get network prefix to copy into multicast address */
+ ipv6_addr_prefix(&network_pfx, &table[i].pfx,
+ table[i].plen);
+ /* setting network prefix */
+ memcpy(&addr_mcast.s6_addr[4], &network_pfx, 8);
+
+ if (ipv6_addr_equal(addr, &addr_mcast)) {
+ ret = &table[i];
+ break;
+ }
+ }
+
+ return ret;
+}
+
/* Uncompress address function for source and
* destination address(non-multicast).
*
@@ -259,30 +355,59 @@ static int uncompress_addr(struct sk_buff *skb, const struct net_device *dev,
/* Uncompress address function for source context
* based address(non-multicast).
*/
-static int uncompress_context_based_src_addr(struct sk_buff *skb,
- struct in6_addr *ipaddr,
- u8 address_mode)
+static int uncompress_ctx_addr(struct sk_buff *skb,
+ const struct net_device *dev,
+ const struct lowpan_iphc_ctx *ctx,
+ struct in6_addr *ipaddr, u8 address_mode,
+ const void *lladdr)
{
+ bool fail;
+
switch (address_mode) {
- case LOWPAN_IPHC_SAM_00:
- /* unspec address ::
+ /* SAM and DAM are the same here */
+ case LOWPAN_IPHC_DAM_00:
+ fail = false;
+ /* SAM_00 -> unspec address ::
* Do nothing, address is already ::
+ *
+ * DAM 00 -> reserved should never occur.
*/
break;
case LOWPAN_IPHC_SAM_01:
- /* TODO */
+ case LOWPAN_IPHC_DAM_01:
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[8], 8);
+ ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+ break;
case LOWPAN_IPHC_SAM_10:
- /* TODO */
+ case LOWPAN_IPHC_DAM_10:
+ ipaddr->s6_addr[11] = 0xFF;
+ ipaddr->s6_addr[12] = 0xFE;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[14], 2);
+ ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+ break;
case LOWPAN_IPHC_SAM_11:
- /* TODO */
- netdev_warn(skb->dev, "SAM value 0x%x not supported\n",
- address_mode);
- return -EINVAL;
+ case LOWPAN_IPHC_DAM_11:
+ fail = false;
+ switch (lowpan_priv(dev)->lltype) {
+ case LOWPAN_LLTYPE_IEEE802154:
+ iphc_uncompress_802154_lladdr(ipaddr, lladdr);
+ break;
+ default:
+ iphc_uncompress_eui64_lladdr(ipaddr, lladdr);
+ break;
+ }
+ ipv6_addr_prefix_copy(ipaddr, &ctx->pfx, ctx->plen);
+ break;
default:
pr_debug("Invalid sam value: 0x%x\n", address_mode);
return -EINVAL;
}
+ if (fail) {
+ pr_debug("Failed to fetch skb data\n");
+ return -EIO;
+ }
+
raw_dump_inline(NULL,
"Reconstructed context based ipv6 src addr is",
ipaddr->s6_addr, 16);
@@ -346,6 +471,30 @@ static int lowpan_uncompress_multicast_daddr(struct sk_buff *skb,
return 0;
}
+static int lowpan_uncompress_multicast_ctx_daddr(struct sk_buff *skb,
+ struct lowpan_iphc_ctx *ctx,
+ struct in6_addr *ipaddr,
+ u8 address_mode)
+{
+ struct in6_addr network_pfx = {};
+ bool fail;
+
+ ipaddr->s6_addr[0] = 0xFF;
+ fail = lowpan_fetch_skb(skb, &ipaddr->s6_addr[1], 2);
+ fail |= lowpan_fetch_skb(skb, &ipaddr->s6_addr[12], 4);
+ if (fail)
+ return -EIO;
+
+ /* take prefix_len and network prefix from the context */
+ ipaddr->s6_addr[3] = ctx->plen;
+ /* get network prefix to copy into multicast address */
+ ipv6_addr_prefix(&network_pfx, &ctx->pfx, ctx->plen);
+ /* setting network prefix */
+ memcpy(&ipaddr->s6_addr[4], &network_pfx, 8);
+
+ return 0;
+}
+
/* get the ecn values from iphc tf format and set it to ipv6hdr */
static inline void lowpan_iphc_tf_set_ecn(struct ipv6hdr *hdr, const u8 *tf)
{
@@ -459,7 +608,8 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
const void *daddr, const void *saddr)
{
struct ipv6hdr hdr = {};
- u8 iphc0, iphc1;
+ struct lowpan_iphc_ctx *ci;
+ u8 iphc0, iphc1, cid = 0;
int err;
raw_dump_table(__func__, "raw skb data dump uncompressed",
@@ -469,12 +619,14 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
lowpan_fetch_skb(skb, &iphc1, sizeof(iphc1)))
return -EINVAL;
- /* another if the CID flag is set */
- if (iphc1 & LOWPAN_IPHC_CID)
- return -ENOTSUPP;
-
hdr.version = 6;
+ /* default CID = 0, another if the CID flag is set */
+ if (iphc1 & LOWPAN_IPHC_CID) {
+ if (lowpan_fetch_skb(skb, &cid, sizeof(cid)))
+ return -EINVAL;
+ }
+
err = lowpan_iphc_tf_decompress(skb, &hdr,
iphc0 & LOWPAN_IPHC_TF_MASK);
if (err < 0)
@@ -500,10 +652,17 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
}
if (iphc1 & LOWPAN_IPHC_SAC) {
- /* Source address context based uncompression */
+ spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_SCI(cid));
+ if (!ci) {
+ spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ return -EINVAL;
+ }
+
pr_debug("SAC bit is set. Handle context based source address.\n");
- err = uncompress_context_based_src_addr(skb, &hdr.saddr,
- iphc1 & LOWPAN_IPHC_SAM_MASK);
+ err = uncompress_ctx_addr(skb, dev, ci, &hdr.saddr,
+ iphc1 & LOWPAN_IPHC_SAM_MASK, saddr);
+ spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
} else {
/* Source address uncompression */
pr_debug("source address stateless compression\n");
@@ -515,27 +674,52 @@ int lowpan_header_decompress(struct sk_buff *skb, const struct net_device *dev,
if (err)
return -EINVAL;
- /* check for Multicast Compression */
- if (iphc1 & LOWPAN_IPHC_M) {
- if (iphc1 & LOWPAN_IPHC_DAC) {
- pr_debug("dest: context-based mcast compression\n");
- /* TODO: implement this */
- } else {
- err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
- iphc1 & LOWPAN_IPHC_DAM_MASK);
+ switch (iphc1 & (LOWPAN_IPHC_M | LOWPAN_IPHC_DAC)) {
+ case LOWPAN_IPHC_M | LOWPAN_IPHC_DAC:
+ spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
+ if (!ci) {
+ spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ return -EINVAL;
+ }
- if (err)
- return -EINVAL;
+ /* multicast with context */
+ pr_debug("dest: context-based mcast compression\n");
+ err = lowpan_uncompress_multicast_ctx_daddr(skb, ci,
+ &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK);
+ spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ break;
+ case LOWPAN_IPHC_M:
+ /* multicast */
+ err = lowpan_uncompress_multicast_daddr(skb, &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK);
+ break;
+ case LOWPAN_IPHC_DAC:
+ spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ ci = lowpan_iphc_ctx_get_by_id(dev, LOWPAN_IPHC_CID_DCI(cid));
+ if (!ci) {
+ spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ return -EINVAL;
}
- } else {
+
+ /* Destination address context based uncompression */
+ pr_debug("DAC bit is set. Handle context based destination address.\n");
+ err = uncompress_ctx_addr(skb, dev, ci, &hdr.daddr,
+ iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
+ spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+ break;
+ default:
err = uncompress_addr(skb, dev, &hdr.daddr,
iphc1 & LOWPAN_IPHC_DAM_MASK, daddr);
pr_debug("dest: stateless compression mode %d dest %pI6c\n",
iphc1 & LOWPAN_IPHC_DAM_MASK, &hdr.daddr);
- if (err)
- return -EINVAL;
+ break;
}
+ if (err)
+ return -EINVAL;
+
/* Next header data uncompression */
if (iphc0 & LOWPAN_IPHC_NH) {
err = lowpan_nhc_do_uncompression(skb, dev, &hdr);
@@ -585,6 +769,58 @@ static const u8 lowpan_iphc_dam_to_sam_value[] = {
[LOWPAN_IPHC_DAM_11] = LOWPAN_IPHC_SAM_11,
};
+static u8 lowpan_compress_ctx_addr(u8 **hc_ptr, const struct in6_addr *ipaddr,
+ const struct lowpan_iphc_ctx *ctx,
+ const unsigned char *lladdr, bool sam)
+{
+ struct in6_addr tmp = {};
+ u8 dam;
+
+ /* check for SAM/DAM = 11 */
+ memcpy(&tmp.s6_addr[8], lladdr, 8);
+ /* second bit-flip (Universe/Local) is done according RFC2464 */
+ tmp.s6_addr[8] ^= 0x02;
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr)) {
+ dam = LOWPAN_IPHC_DAM_11;
+ goto out;
+ }
+
+ memset(&tmp, 0, sizeof(tmp));
+ /* check for SAM/DAM = 10 */
+ tmp.s6_addr[11] = 0xFF;
+ tmp.s6_addr[12] = 0xFE;
+ memcpy(&tmp.s6_addr[14], &ipaddr->s6_addr[14], 2);
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr)) {
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[14], 2);
+ dam = LOWPAN_IPHC_DAM_10;
+ goto out;
+ }
+
+ memset(&tmp, 0, sizeof(tmp));
+ /* check for SAM/DAM = 01, should always match */
+ memcpy(&tmp.s6_addr[8], &ipaddr->s6_addr[8], 8);
+ /* context information are always used */
+ ipv6_addr_prefix_copy(&tmp, &ctx->pfx, ctx->plen);
+ if (ipv6_addr_equal(&tmp, ipaddr)) {
+ lowpan_push_hc_data(hc_ptr, &ipaddr->s6_addr[8], 8);
+ dam = LOWPAN_IPHC_DAM_01;
+ goto out;
+ }
+
+ WARN_ONCE(1, "context found but no address mode matched\n");
+ return LOWPAN_IPHC_DAM_00;
+out:
+
+ if (sam)
+ return lowpan_iphc_dam_to_sam_value[dam];
+ else
+ return dam;
+}
+
static u8 lowpan_compress_addr_64(u8 **hc_ptr, const struct in6_addr *ipaddr,
const unsigned char *lladdr, bool sam)
{
@@ -708,6 +944,21 @@ static u8 lowpan_iphc_tf_compress(u8 **hc_ptr, const struct ipv6hdr *hdr)
return val;
}
+static u8 lowpan_iphc_mcast_ctx_addr_compress(u8 **hc_ptr,
+ const struct lowpan_iphc_ctx *ctx,
+ const struct in6_addr *ipaddr)
+{
+ u8 data[6];
+
+ /* flags/scope, reserved (RIID) */
+ memcpy(data, &ipaddr->s6_addr[1], 2);
+ /* group ID */
+ memcpy(&data[1], &ipaddr->s6_addr[11], 4);
+ lowpan_push_hc_data(hc_ptr, data, 6);
+
+ return LOWPAN_IPHC_DAM_00;
+}
+
static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr,
const struct in6_addr *ipaddr)
{
@@ -742,10 +993,11 @@ static u8 lowpan_iphc_mcast_addr_compress(u8 **hc_ptr,
int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
const void *daddr, const void *saddr)
{
- u8 iphc0, iphc1, *hc_ptr;
+ u8 iphc0, iphc1, *hc_ptr, cid = 0;
struct ipv6hdr *hdr;
u8 head[LOWPAN_IPHC_MAX_HC_BUF_LEN] = {};
- int ret, addr_type;
+ struct lowpan_iphc_ctx *dci, *sci, dci_entry, sci_entry;
+ int ret, ipv6_daddr_type, ipv6_saddr_type;
if (skb->protocol != htons(ETH_P_IPV6))
return -EINVAL;
@@ -769,14 +1021,38 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
iphc0 = LOWPAN_DISPATCH_IPHC;
iphc1 = 0;
- /* TODO: context lookup */
-
raw_dump_inline(__func__, "saddr", saddr, EUI64_ADDR_LEN);
raw_dump_inline(__func__, "daddr", daddr, EUI64_ADDR_LEN);
raw_dump_table(__func__, "sending raw skb network uncompressed packet",
skb->data, skb->len);
+ ipv6_daddr_type = ipv6_addr_type(&hdr->daddr);
+ spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ if (ipv6_daddr_type & IPV6_ADDR_MULTICAST)
+ dci = lowpan_iphc_ctx_get_by_mcast_addr(dev, &hdr->daddr);
+ else
+ dci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->daddr);
+ if (dci) {
+ memcpy(&dci_entry, dci, sizeof(*dci));
+ cid |= dci->id;
+ }
+ spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+
+ spin_lock_bh(&lowpan_priv(dev)->ctx.lock);
+ sci = lowpan_iphc_ctx_get_by_addr(dev, &hdr->saddr);
+ if (sci) {
+ memcpy(&sci_entry, sci, sizeof(*sci));
+ cid |= (sci->id << 4);
+ }
+ spin_unlock_bh(&lowpan_priv(dev)->ctx.lock);
+
+ /* if cid is zero it will be compressed */
+ if (cid) {
+ iphc1 |= LOWPAN_IPHC_CID;
+ lowpan_push_hc_data(&hc_ptr, &cid, sizeof(cid));
+ }
+
/* Traffic Class, Flow Label compression */
iphc0 |= lowpan_iphc_tf_compress(&hc_ptr, hdr);
@@ -813,39 +1089,64 @@ int lowpan_header_compress(struct sk_buff *skb, const struct net_device *dev,
sizeof(hdr->hop_limit));
}
- addr_type = ipv6_addr_type(&hdr->saddr);
+ ipv6_saddr_type = ipv6_addr_type(&hdr->saddr);
/* source address compression */
- if (addr_type == IPV6_ADDR_ANY) {
+ if (ipv6_saddr_type == IPV6_ADDR_ANY) {
pr_debug("source address is unspecified, setting SAC\n");
iphc1 |= LOWPAN_IPHC_SAC;
} else {
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
- iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->saddr,
- saddr, true);
- pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
- &hdr->saddr, iphc1);
+ if (sci) {
+ iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->saddr,
+ &sci_entry, saddr,
+ true);
+ iphc1 |= LOWPAN_IPHC_SAC;
} else {
- pr_debug("send the full source address\n");
- lowpan_push_hc_data(&hc_ptr, hdr->saddr.s6_addr, 16);
+ if (ipv6_saddr_type & IPV6_ADDR_LINKLOCAL) {
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+ &hdr->saddr,
+ saddr, true);
+ pr_debug("source address unicast link-local %pI6c iphc1 0x%02x\n",
+ &hdr->saddr, iphc1);
+ } else {
+ pr_debug("send the full source address\n");
+ lowpan_push_hc_data(&hc_ptr,
+ hdr->saddr.s6_addr, 16);
+ }
}
}
- addr_type = ipv6_addr_type(&hdr->daddr);
/* destination address compression */
- if (addr_type & IPV6_ADDR_MULTICAST) {
+ if (ipv6_daddr_type & IPV6_ADDR_MULTICAST) {
pr_debug("destination address is multicast: ");
iphc1 |= LOWPAN_IPHC_M;
- iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr, &hdr->daddr);
+ if (dci) {
+ iphc1 |= lowpan_iphc_mcast_ctx_addr_compress(&hc_ptr,
+ &dci_entry,
+ &hdr->daddr);
+ iphc1 |= LOWPAN_IPHC_DAC;
+ } else {
+ iphc1 |= lowpan_iphc_mcast_addr_compress(&hc_ptr,
+ &hdr->daddr);
+ }
} else {
- if (addr_type & IPV6_ADDR_LINKLOCAL) {
- /* TODO: context lookup */
- iphc1 |= lowpan_compress_addr_64(&hc_ptr, &hdr->daddr,
- daddr, false);
- pr_debug("dest address unicast link-local %pI6c "
- "iphc1 0x%02x\n", &hdr->daddr, iphc1);
+ if (dci) {
+ iphc1 |= lowpan_compress_ctx_addr(&hc_ptr, &hdr->daddr,
+ &dci_entry, daddr,
+ false);
+ iphc1 |= LOWPAN_IPHC_DAC;
} else {
- pr_debug("dest address unicast %pI6c\n", &hdr->daddr);
- lowpan_push_hc_data(&hc_ptr, hdr->daddr.s6_addr, 16);
+ if (ipv6_daddr_type & IPV6_ADDR_LINKLOCAL) {
+ iphc1 |= lowpan_compress_addr_64(&hc_ptr,
+ &hdr->daddr,
+ daddr, false);
+ pr_debug("dest address unicast link-local %pI6c iphc1 0x%02x\n",
+ &hdr->daddr, iphc1);
+ } else {
+ pr_debug("dest address unicast %pI6c\n",
+ &hdr->daddr);
+ lowpan_push_hc_data(&hc_ptr,
+ hdr->daddr.s6_addr, 16);
+ }
}
}
diff --git a/net/6lowpan/nhc_ghc_ext_dest.c b/net/6lowpan/nhc_ghc_ext_dest.c
new file mode 100644
index 000000000000..9887b3a15348
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_ext_dest.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN Extension Header compression according to RFC7400
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_EXT_DEST_IDLEN 1
+#define LOWPAN_GHC_EXT_DEST_ID_0 0xb6
+#define LOWPAN_GHC_EXT_DEST_MASK_0 0xfe
+
+static void dest_ghid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_GHC_EXT_DEST_ID_0;
+ nhc->idmask[0] = LOWPAN_GHC_EXT_DEST_MASK_0;
+}
+
+LOWPAN_NHC(ghc_ext_dest, "RFC7400 Destination Extension Header", NEXTHDR_DEST,
+ 0, dest_ghid_setup, LOWPAN_GHC_EXT_DEST_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(ghc_ext_dest);
+MODULE_DESCRIPTION("6LoWPAN generic header destination extension compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_ext_frag.c b/net/6lowpan/nhc_ghc_ext_frag.c
new file mode 100644
index 000000000000..1308b79e939d
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_ext_frag.c
@@ -0,0 +1,28 @@
+/*
+ * 6LoWPAN Extension Header compression according to RFC7400
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_EXT_FRAG_IDLEN 1
+#define LOWPAN_GHC_EXT_FRAG_ID_0 0xb4
+#define LOWPAN_GHC_EXT_FRAG_MASK_0 0xfe
+
+static void frag_ghid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_GHC_EXT_FRAG_ID_0;
+ nhc->idmask[0] = LOWPAN_GHC_EXT_FRAG_MASK_0;
+}
+
+LOWPAN_NHC(ghc_ext_frag, "RFC7400 Fragmentation Extension Header",
+ NEXTHDR_FRAGMENT, 0, frag_ghid_setup,
+ LOWPAN_GHC_EXT_FRAG_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(ghc_ext_frag);
+MODULE_DESCRIPTION("6LoWPAN generic header fragmentation extension compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_ext_hop.c b/net/6lowpan/nhc_ghc_ext_hop.c
new file mode 100644
index 000000000000..baec86fd1974
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_ext_hop.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN Extension Header compression according to RFC7400
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_EXT_HOP_IDLEN 1
+#define LOWPAN_GHC_EXT_HOP_ID_0 0xb0
+#define LOWPAN_GHC_EXT_HOP_MASK_0 0xfe
+
+static void hop_ghid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_GHC_EXT_HOP_ID_0;
+ nhc->idmask[0] = LOWPAN_GHC_EXT_HOP_MASK_0;
+}
+
+LOWPAN_NHC(ghc_ext_hop, "RFC7400 Hop-by-Hop Extension Header", NEXTHDR_HOP, 0,
+ hop_ghid_setup, LOWPAN_GHC_EXT_HOP_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(ghc_ext_hop);
+MODULE_DESCRIPTION("6LoWPAN generic header hop-by-hop extension compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_ext_route.c b/net/6lowpan/nhc_ghc_ext_route.c
new file mode 100644
index 000000000000..d7e5bd791c62
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_ext_route.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN Extension Header compression according to RFC7400
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_EXT_ROUTE_IDLEN 1
+#define LOWPAN_GHC_EXT_ROUTE_ID_0 0xb2
+#define LOWPAN_GHC_EXT_ROUTE_MASK_0 0xfe
+
+static void route_ghid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_GHC_EXT_ROUTE_ID_0;
+ nhc->idmask[0] = LOWPAN_GHC_EXT_ROUTE_MASK_0;
+}
+
+LOWPAN_NHC(ghc_ext_route, "RFC7400 Routing Extension Header", NEXTHDR_ROUTING,
+ 0, route_ghid_setup, LOWPAN_GHC_EXT_ROUTE_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(ghc_ext_route);
+MODULE_DESCRIPTION("6LoWPAN generic header routing extension compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_icmpv6.c b/net/6lowpan/nhc_ghc_icmpv6.c
new file mode 100644
index 000000000000..32e7c2c66bbc
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_icmpv6.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN ICMPv6 compression according to RFC7400
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_ICMPV6_IDLEN 1
+#define LOWPAN_GHC_ICMPV6_ID_0 0xdf
+#define LOWPAN_GHC_ICMPV6_MASK_0 0xff
+
+static void icmpv6_ghid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_GHC_ICMPV6_ID_0;
+ nhc->idmask[0] = LOWPAN_GHC_ICMPV6_MASK_0;
+}
+
+LOWPAN_NHC(ghc_icmpv6, "RFC7400 ICMPv6", NEXTHDR_ICMP, 0,
+ icmpv6_ghid_setup, LOWPAN_GHC_ICMPV6_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(ghc_icmpv6);
+MODULE_DESCRIPTION("6LoWPAN generic header ICMPv6 compression");
+MODULE_LICENSE("GPL");
diff --git a/net/6lowpan/nhc_ghc_udp.c b/net/6lowpan/nhc_ghc_udp.c
new file mode 100644
index 000000000000..17beefa52ca8
--- /dev/null
+++ b/net/6lowpan/nhc_ghc_udp.c
@@ -0,0 +1,27 @@
+/*
+ * 6LoWPAN UDP compression according to RFC7400
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+
+#include "nhc.h"
+
+#define LOWPAN_GHC_UDP_IDLEN 1
+#define LOWPAN_GHC_UDP_ID_0 0xd0
+#define LOWPAN_GHC_UDP_MASK_0 0xf8
+
+static void udp_ghid_setup(struct lowpan_nhc *nhc)
+{
+ nhc->id[0] = LOWPAN_GHC_UDP_ID_0;
+ nhc->idmask[0] = LOWPAN_GHC_UDP_MASK_0;
+}
+
+LOWPAN_NHC(ghc_udp, "RFC7400 UDP", NEXTHDR_UDP, 0,
+ udp_ghid_setup, LOWPAN_GHC_UDP_IDLEN, NULL, NULL);
+
+module_lowpan_nhc(ghc_udp);
+MODULE_DESCRIPTION("6LoWPAN generic header UDP compression");
+MODULE_LICENSE("GPL");
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index d2cd9de4b724..a1e273af6fc8 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -261,7 +261,6 @@ static int register_vlan_device(struct net_device *real_dev, u16 vlan_id)
* hope the underlying device can handle it.
*/
new_dev->mtu = real_dev->mtu;
- new_dev->priv_flags |= (real_dev->priv_flags & IFF_UNICAST_FLT);
vlan = vlan_dev_priv(new_dev);
vlan->vlan_proto = htons(ETH_P_8021Q);
@@ -312,6 +311,7 @@ static void vlan_transfer_features(struct net_device *dev,
struct vlan_dev_priv *vlan = vlan_dev_priv(vlandev);
vlandev->gso_max_size = dev->gso_max_size;
+ vlandev->gso_max_segs = dev->gso_max_segs;
if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
vlandev->hard_header_len = dev->hard_header_len;
diff --git a/net/8021q/vlan_core.c b/net/8021q/vlan_core.c
index 496b27588493..e2ed69850489 100644
--- a/net/8021q/vlan_core.c
+++ b/net/8021q/vlan_core.c
@@ -30,7 +30,9 @@ bool vlan_do_receive(struct sk_buff **skbp)
skb->pkt_type = PACKET_HOST;
}
- if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR)) {
+ if (!(vlan_dev_priv(vlan_dev)->flags & VLAN_FLAG_REORDER_HDR) &&
+ !netif_is_macvlan_port(vlan_dev) &&
+ !netif_is_bridge_port(vlan_dev)) {
unsigned int offset = skb->data - skb_mac_header(skb);
/*
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index fded86508117..e7e62570bdb8 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -30,6 +30,7 @@
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <net/arp.h>
+#include <net/switchdev.h>
#include "vlan.h"
#include "vlanproc.h"
@@ -542,14 +543,15 @@ static int vlan_dev_init(struct net_device *dev)
(1<<__LINK_STATE_DORMANT))) |
(1<<__LINK_STATE_PRESENT);
- dev->hw_features = NETIF_F_ALL_CSUM | NETIF_F_SG |
+ dev->hw_features = NETIF_F_HW_CSUM | NETIF_F_SG |
NETIF_F_FRAGLIST | NETIF_F_GSO_SOFTWARE |
- NETIF_F_HIGHDMA | NETIF_F_SCTP_CSUM |
+ NETIF_F_HIGHDMA | NETIF_F_SCTP_CRC |
NETIF_F_ALL_FCOE;
dev->features |= real_dev->vlan_features | NETIF_F_LLTX |
NETIF_F_GSO_SOFTWARE;
dev->gso_max_size = real_dev->gso_max_size;
+ dev->gso_max_segs = real_dev->gso_max_segs;
if (dev->features & NETIF_F_VLAN_FEATURES)
netdev_warn(real_dev, "VLAN features are set incorrectly. Q-in-Q configurations may not work correctly.\n");
@@ -620,12 +622,12 @@ static netdev_features_t vlan_dev_fix_features(struct net_device *dev,
return features;
}
-static int vlan_ethtool_get_settings(struct net_device *dev,
- struct ethtool_cmd *cmd)
+static int vlan_ethtool_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *cmd)
{
const struct vlan_dev_priv *vlan = vlan_dev_priv(dev);
- return __ethtool_get_settings(vlan->real_dev, cmd);
+ return __ethtool_get_link_ksettings(vlan->real_dev, cmd);
}
static void vlan_ethtool_get_drvinfo(struct net_device *dev,
@@ -740,7 +742,7 @@ static int vlan_dev_get_iflink(const struct net_device *dev)
}
static const struct ethtool_ops vlan_ethtool_ops = {
- .get_settings = vlan_ethtool_get_settings,
+ .get_link_ksettings = vlan_ethtool_get_link_ksettings,
.get_drvinfo = vlan_ethtool_get_drvinfo,
.get_link = ethtool_op_get_link,
.get_ts_info = vlan_ethtool_get_ts_info,
@@ -774,6 +776,12 @@ static const struct net_device_ops vlan_netdev_ops = {
.ndo_netpoll_cleanup = vlan_dev_netpoll_cleanup,
#endif
.ndo_fix_features = vlan_dev_fix_features,
+ .ndo_fdb_add = switchdev_port_fdb_add,
+ .ndo_fdb_del = switchdev_port_fdb_del,
+ .ndo_fdb_dump = switchdev_port_fdb_dump,
+ .ndo_bridge_setlink = switchdev_port_bridge_setlink,
+ .ndo_bridge_getlink = switchdev_port_bridge_getlink,
+ .ndo_bridge_dellink = switchdev_port_bridge_dellink,
.ndo_get_lock_subclass = vlan_dev_get_lock_subclass,
.ndo_get_iflink = vlan_dev_get_iflink,
};
@@ -792,6 +800,7 @@ void vlan_setup(struct net_device *dev)
ether_setup(dev);
dev->priv_flags |= IFF_802_1Q_VLAN | IFF_NO_QUEUE;
+ dev->priv_flags |= IFF_UNICAST_FLT;
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
netif_keep_dst(dev);
diff --git a/net/8021q/vlanproc.c b/net/8021q/vlanproc.c
index ae63cf72a953..5f1446c9f098 100644
--- a/net/8021q/vlanproc.c
+++ b/net/8021q/vlanproc.c
@@ -184,12 +184,11 @@ int vlan_proc_add_dev(struct net_device *vlandev)
/*
* Delete directory entry for VLAN device.
*/
-int vlan_proc_rem_dev(struct net_device *vlandev)
+void vlan_proc_rem_dev(struct net_device *vlandev)
{
/** NOTE: This will consume the memory pointed to by dent, it seems. */
proc_remove(vlan_dev_priv(vlandev)->dent);
vlan_dev_priv(vlandev)->dent = NULL;
- return 0;
}
/****** Proc filesystem entry points ****************************************/
diff --git a/net/8021q/vlanproc.h b/net/8021q/vlanproc.h
index 063f60a3d5cc..8838a2e92eb6 100644
--- a/net/8021q/vlanproc.h
+++ b/net/8021q/vlanproc.h
@@ -5,7 +5,7 @@
struct net;
int vlan_proc_init(struct net *net);
-int vlan_proc_rem_dev(struct net_device *vlandev);
+void vlan_proc_rem_dev(struct net_device *vlandev);
int vlan_proc_add_dev(struct net_device *vlandev);
void vlan_proc_cleanup(struct net *net);
@@ -14,7 +14,7 @@ void vlan_proc_cleanup(struct net *net);
#define vlan_proc_init(net) (0)
#define vlan_proc_cleanup(net) do {} while (0)
#define vlan_proc_add_dev(dev) ({(void)(dev), 0; })
-#define vlan_proc_rem_dev(dev) ({(void)(dev), 0; })
+#define vlan_proc_rem_dev(dev) do {} while (0)
#endif
#endif /* !(__BEN_VLAN_PROC_INC__) */
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index bced8c074c12..7bc2208b6cc4 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -108,9 +108,7 @@ struct p9_poll_wait {
* @unsent_req_list: accounting for requests that haven't been sent
* @req: current request being processed (if any)
* @tmp_buf: temporary buffer to read in header
- * @rsize: amount to read for current frame
- * @rpos: read position in current frame
- * @rbuf: current read buffer
+ * @rc: temporary fcall for reading current frame
* @wpos: write position for current frame
* @wsize: amount of data to write for current frame
* @wbuf: current write buffer
@@ -131,9 +129,7 @@ struct p9_conn {
struct list_head unsent_req_list;
struct p9_req_t *req;
char tmp_buf[7];
- int rsize;
- int rpos;
- char *rbuf;
+ struct p9_fcall rc;
int wpos;
int wsize;
char *wbuf;
@@ -305,69 +301,77 @@ static void p9_read_work(struct work_struct *work)
if (m->err < 0)
return;
- p9_debug(P9_DEBUG_TRANS, "start mux %p pos %d\n", m, m->rpos);
+ p9_debug(P9_DEBUG_TRANS, "start mux %p pos %zd\n", m, m->rc.offset);
- if (!m->rbuf) {
- m->rbuf = m->tmp_buf;
- m->rpos = 0;
- m->rsize = 7; /* start by reading header */
+ if (!m->rc.sdata) {
+ m->rc.sdata = m->tmp_buf;
+ m->rc.offset = 0;
+ m->rc.capacity = 7; /* start by reading header */
}
clear_bit(Rpending, &m->wsched);
- p9_debug(P9_DEBUG_TRANS, "read mux %p pos %d size: %d = %d\n",
- m, m->rpos, m->rsize, m->rsize-m->rpos);
- err = p9_fd_read(m->client, m->rbuf + m->rpos,
- m->rsize - m->rpos);
+ p9_debug(P9_DEBUG_TRANS, "read mux %p pos %zd size: %zd = %zd\n",
+ m, m->rc.offset, m->rc.capacity,
+ m->rc.capacity - m->rc.offset);
+ err = p9_fd_read(m->client, m->rc.sdata + m->rc.offset,
+ m->rc.capacity - m->rc.offset);
p9_debug(P9_DEBUG_TRANS, "mux %p got %d bytes\n", m, err);
- if (err == -EAGAIN) {
+ if (err == -EAGAIN)
goto end_clear;
- }
if (err <= 0)
goto error;
- m->rpos += err;
+ m->rc.offset += err;
- if ((!m->req) && (m->rpos == m->rsize)) { /* header read in */
- u16 tag;
+ /* header read in */
+ if ((!m->req) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new header\n");
- n = le32_to_cpu(*(__le32 *) m->rbuf); /* read packet size */
- if (n >= m->client->msize) {
+ err = p9_parse_header(&m->rc, NULL, NULL, NULL, 0);
+ if (err) {
+ p9_debug(P9_DEBUG_ERROR,
+ "error parsing header: %d\n", err);
+ goto error;
+ }
+
+ if (m->rc.size >= m->client->msize) {
p9_debug(P9_DEBUG_ERROR,
- "requested packet size too big: %d\n", n);
+ "requested packet size too big: %d\n",
+ m->rc.size);
err = -EIO;
goto error;
}
- tag = le16_to_cpu(*(__le16 *) (m->rbuf+5)); /* read tag */
p9_debug(P9_DEBUG_TRANS,
- "mux %p pkt: size: %d bytes tag: %d\n", m, n, tag);
+ "mux %p pkt: size: %d bytes tag: %d\n",
+ m, m->rc.size, m->rc.tag);
- m->req = p9_tag_lookup(m->client, tag);
+ m->req = p9_tag_lookup(m->client, m->rc.tag);
if (!m->req || (m->req->status != REQ_STATUS_SENT)) {
p9_debug(P9_DEBUG_ERROR, "Unexpected packet tag %d\n",
- tag);
+ m->rc.tag);
err = -EIO;
goto error;
}
if (m->req->rc == NULL) {
- m->req->rc = kmalloc(sizeof(struct p9_fcall) +
- m->client->msize, GFP_NOFS);
- if (!m->req->rc) {
- m->req = NULL;
- err = -ENOMEM;
- goto error;
- }
+ p9_debug(P9_DEBUG_ERROR,
+ "No recv fcall for tag %d (req %p), disconnecting!\n",
+ m->rc.tag, m->req);
+ m->req = NULL;
+ err = -EIO;
+ goto error;
}
- m->rbuf = (char *)m->req->rc + sizeof(struct p9_fcall);
- memcpy(m->rbuf, m->tmp_buf, m->rsize);
- m->rsize = n;
+ m->rc.sdata = (char *)m->req->rc + sizeof(struct p9_fcall);
+ memcpy(m->rc.sdata, m->tmp_buf, m->rc.capacity);
+ m->rc.capacity = m->rc.size;
}
- /* not an else because some packets (like clunk) have no payload */
- if ((m->req) && (m->rpos == m->rsize)) { /* packet is read in */
+ /* packet is read in
+ * not an else because some packets (like clunk) have no payload
+ */
+ if ((m->req) && (m->rc.offset == m->rc.capacity)) {
p9_debug(P9_DEBUG_TRANS, "got new packet\n");
spin_lock(&m->client->lock);
if (m->req->status != REQ_STATUS_ERROR)
@@ -375,9 +379,9 @@ static void p9_read_work(struct work_struct *work)
list_del(&m->req->req_list);
spin_unlock(&m->client->lock);
p9_client_cb(m->client, m->req, status);
- m->rbuf = NULL;
- m->rpos = 0;
- m->rsize = 0;
+ m->rc.sdata = NULL;
+ m->rc.offset = 0;
+ m->rc.capacity = 0;
m->req = NULL;
}
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index ba1210253f5e..1852e383afd6 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -109,14 +109,13 @@ struct p9_trans_rdma {
/**
* p9_rdma_context - Keeps track of in-process WR
*
- * @wc_op: The original WR op for when the CQE completes in error.
* @busa: Bus address to unmap when the WR completes
* @req: Keeps track of requests (send)
* @rc: Keepts track of replies (receive)
*/
struct p9_rdma_req;
struct p9_rdma_context {
- enum ib_wc_opcode wc_op;
+ struct ib_cqe cqe;
dma_addr_t busa;
union {
struct p9_req_t *req;
@@ -284,9 +283,12 @@ p9_cm_event_handler(struct rdma_cm_id *id, struct rdma_cm_event *event)
}
static void
-handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
- struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
+recv_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct p9_client *client = cq->cq_context;
+ struct p9_trans_rdma *rdma = client->trans;
+ struct p9_rdma_context *c =
+ container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
struct p9_req_t *req;
int err = 0;
int16_t tag;
@@ -295,7 +297,7 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
ib_dma_unmap_single(rdma->cm_id->device, c->busa, client->msize,
DMA_FROM_DEVICE);
- if (status != IB_WC_SUCCESS)
+ if (wc->status != IB_WC_SUCCESS)
goto err_out;
err = p9_parse_header(c->rc, NULL, NULL, &tag, 1);
@@ -316,21 +318,32 @@ handle_recv(struct p9_client *client, struct p9_trans_rdma *rdma,
req->rc = c->rc;
p9_client_cb(client, req, REQ_STATUS_RCVD);
+ out:
+ up(&rdma->rq_sem);
+ kfree(c);
return;
err_out:
- p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n", req, err, status);
+ p9_debug(P9_DEBUG_ERROR, "req %p err %d status %d\n",
+ req, err, wc->status);
rdma->state = P9_RDMA_FLUSHING;
client->status = Disconnected;
+ goto out;
}
static void
-handle_send(struct p9_client *client, struct p9_trans_rdma *rdma,
- struct p9_rdma_context *c, enum ib_wc_status status, u32 byte_len)
+send_done(struct ib_cq *cq, struct ib_wc *wc)
{
+ struct p9_client *client = cq->cq_context;
+ struct p9_trans_rdma *rdma = client->trans;
+ struct p9_rdma_context *c =
+ container_of(wc->wr_cqe, struct p9_rdma_context, cqe);
+
ib_dma_unmap_single(rdma->cm_id->device,
c->busa, c->req->tc->size,
DMA_TO_DEVICE);
+ up(&rdma->sq_sem);
+ kfree(c);
}
static void qp_event_handler(struct ib_event *event, void *context)
@@ -339,42 +352,6 @@ static void qp_event_handler(struct ib_event *event, void *context)
event->event, context);
}
-static void cq_comp_handler(struct ib_cq *cq, void *cq_context)
-{
- struct p9_client *client = cq_context;
- struct p9_trans_rdma *rdma = client->trans;
- int ret;
- struct ib_wc wc;
-
- ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
- while ((ret = ib_poll_cq(cq, 1, &wc)) > 0) {
- struct p9_rdma_context *c = (void *) (unsigned long) wc.wr_id;
-
- switch (c->wc_op) {
- case IB_WC_RECV:
- handle_recv(client, rdma, c, wc.status, wc.byte_len);
- up(&rdma->rq_sem);
- break;
-
- case IB_WC_SEND:
- handle_send(client, rdma, c, wc.status, wc.byte_len);
- up(&rdma->sq_sem);
- break;
-
- default:
- pr_err("unexpected completion type, c->wc_op=%d, wc.opcode=%d, status=%d\n",
- c->wc_op, wc.opcode, wc.status);
- break;
- }
- kfree(c);
- }
-}
-
-static void cq_event_handler(struct ib_event *e, void *v)
-{
- p9_debug(P9_DEBUG_ERROR, "CQ event %d context %p\n", e->event, v);
-}
-
static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
{
if (!rdma)
@@ -387,7 +364,7 @@ static void rdma_destroy_trans(struct p9_trans_rdma *rdma)
ib_dealloc_pd(rdma->pd);
if (rdma->cq && !IS_ERR(rdma->cq))
- ib_destroy_cq(rdma->cq);
+ ib_free_cq(rdma->cq);
if (rdma->cm_id && !IS_ERR(rdma->cm_id))
rdma_destroy_id(rdma->cm_id);
@@ -408,13 +385,14 @@ post_recv(struct p9_client *client, struct p9_rdma_context *c)
if (ib_dma_mapping_error(rdma->cm_id->device, c->busa))
goto error;
+ c->cqe.done = recv_done;
+
sge.addr = c->busa;
sge.length = client->msize;
sge.lkey = rdma->pd->local_dma_lkey;
wr.next = NULL;
- c->wc_op = IB_WC_RECV;
- wr.wr_id = (unsigned long) c;
+ wr.wr_cqe = &c->cqe;
wr.sg_list = &sge;
wr.num_sge = 1;
return ib_post_recv(rdma->qp, &wr, &bad_wr);
@@ -499,13 +477,14 @@ dont_need_post_recv:
goto send_error;
}
+ c->cqe.done = send_done;
+
sge.addr = c->busa;
sge.length = c->req->tc->size;
sge.lkey = rdma->pd->local_dma_lkey;
wr.next = NULL;
- c->wc_op = IB_WC_SEND;
- wr.wr_id = (unsigned long) c;
+ wr.wr_cqe = &c->cqe;
wr.opcode = IB_WR_SEND;
wr.send_flags = IB_SEND_SIGNALED;
wr.sg_list = &sge;
@@ -642,7 +621,6 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
struct p9_trans_rdma *rdma;
struct rdma_conn_param conn_param;
struct ib_qp_init_attr qp_attr;
- struct ib_cq_init_attr cq_attr = {};
/* Parse the transport specific mount options */
err = parse_opts(args, &opts);
@@ -655,8 +633,8 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
return -ENOMEM;
/* Create the RDMA CM ID */
- rdma->cm_id = rdma_create_id(p9_cm_event_handler, client, RDMA_PS_TCP,
- IB_QPT_RC);
+ rdma->cm_id = rdma_create_id(&init_net, p9_cm_event_handler, client,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(rdma->cm_id))
goto error;
@@ -695,13 +673,11 @@ rdma_create_trans(struct p9_client *client, const char *addr, char *args)
goto error;
/* Create the Completion Queue */
- cq_attr.cqe = opts.sq_depth + opts.rq_depth + 1;
- rdma->cq = ib_create_cq(rdma->cm_id->device, cq_comp_handler,
- cq_event_handler, client,
- &cq_attr);
+ rdma->cq = ib_alloc_cq(rdma->cm_id->device, client,
+ opts.sq_depth + opts.rq_depth + 1,
+ 0, IB_POLL_SOFTIRQ);
if (IS_ERR(rdma->cq))
goto error;
- ib_req_notify_cq(rdma->cq, IB_CQ_NEXT_COMP);
/* Create the Protection Domain */
rdma->pd = ib_alloc_pd(rdma->cm_id->device);
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index 6e70ddb158b4..4acb1d5417aa 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -105,7 +105,7 @@ static struct list_head virtio_chan_list;
/* How many bytes left in this page. */
static unsigned int rest_of_page(void *data)
{
- return PAGE_SIZE - ((unsigned long)data % PAGE_SIZE);
+ return PAGE_SIZE - offset_in_page(data);
}
/**
@@ -143,7 +143,6 @@ static void p9_virtio_close(struct p9_client *client)
static void req_done(struct virtqueue *vq)
{
struct virtio_chan *chan = vq->vdev->priv;
- struct p9_fcall *rc;
unsigned int len;
struct p9_req_t *req;
unsigned long flags;
@@ -152,8 +151,8 @@ static void req_done(struct virtqueue *vq)
while (1) {
spin_lock_irqsave(&chan->lock, flags);
- rc = virtqueue_get_buf(chan->vq, &len);
- if (rc == NULL) {
+ req = virtqueue_get_buf(chan->vq, &len);
+ if (req == NULL) {
spin_unlock_irqrestore(&chan->lock, flags);
break;
}
@@ -161,9 +160,6 @@ static void req_done(struct virtqueue *vq)
spin_unlock_irqrestore(&chan->lock, flags);
/* Wakeup if anyone waiting for VirtIO ring space. */
wake_up(chan->vc_wq);
- p9_debug(P9_DEBUG_TRANS, ": rc %p\n", rc);
- p9_debug(P9_DEBUG_TRANS, ": lookup tag %d\n", rc->tag);
- req = p9_tag_lookup(chan->client, rc->tag);
p9_client_cb(chan->client, req, REQ_STATUS_RCVD);
}
}
@@ -284,7 +280,7 @@ req_retry:
if (in)
sgs[out_sgs + in_sgs++] = chan->sg + out;
- err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
+ err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req,
GFP_ATOMIC);
if (err < 0) {
if (err == -ENOSPC) {
@@ -369,7 +365,7 @@ static int p9_get_mapped_pages(struct virtio_chan *chan,
return -ENOMEM;
*need_drop = 0;
- p -= (*offs = (unsigned long)p % PAGE_SIZE);
+ p -= (*offs = offset_in_page(p));
for (index = 0; index < nr_pages; index++) {
if (is_vmalloc_addr(p))
(*pages)[index] = vmalloc_to_page(p);
@@ -469,7 +465,7 @@ req_retry_pinned:
}
BUG_ON(out_sgs + in_sgs > ARRAY_SIZE(sgs));
- err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req->tc,
+ err = virtqueue_add_sgs(chan->vq, sgs, out_sgs, in_sgs, req,
GFP_ATOMIC);
if (err < 0) {
if (err == -ENOSPC) {
@@ -662,7 +658,7 @@ p9_virtio_create(struct p9_client *client, const char *devname, char *args)
mutex_unlock(&virtio_9p_lock);
if (!found) {
- pr_err("no channels available\n");
+ pr_err("no channels available for device %s\n", devname);
return ret;
}
diff --git a/net/Kconfig b/net/Kconfig
index 127da94ae25e..a8934d8c8fda 100644
--- a/net/Kconfig
+++ b/net/Kconfig
@@ -48,6 +48,9 @@ config COMPAT_NETLINK_MESSAGES
config NET_INGRESS
bool
+config NET_EGRESS
+ bool
+
menu "Networking options"
source "net/packet/Kconfig"
@@ -250,9 +253,17 @@ config XPS
depends on SMP
default y
+config HWBM
+ bool
+
+config SOCK_CGROUP_DATA
+ bool
+ default n
+
config CGROUP_NET_PRIO
bool "Network priority cgroup"
depends on CGROUPS
+ select SOCK_CGROUP_DATA
---help---
Cgroup subsystem for use in assigning processes to network priorities on
a per-interface basis.
@@ -260,6 +271,7 @@ config CGROUP_NET_PRIO
config CGROUP_NET_CLASSID
bool "Network classid cgroup"
depends on CGROUPS
+ select SOCK_CGROUP_DATA
---help---
Cgroup subsystem for use as general purpose socket classid marker that is
being used in cls_cgroup and for netfilter matching.
@@ -351,6 +363,7 @@ source "net/can/Kconfig"
source "net/irda/Kconfig"
source "net/bluetooth/Kconfig"
source "net/rxrpc/Kconfig"
+source "net/kcm/Kconfig"
config FIB_RULES
bool
@@ -383,6 +396,26 @@ config LWTUNNEL
weight tunnel endpoint. Tunnel encapsulation parameters are stored
with light weight tunnel state associated with fib routes.
+config DST_CACHE
+ bool
+ default n
+
+config NET_DEVLINK
+ tristate "Network physical/parent device Netlink interface"
+ help
+ Network physical/parent device Netlink interface provides
+ infrastructure to support access to physical chip-wide config and
+ monitoring.
+
+config MAY_USE_DEVLINK
+ tristate
+ default m if NET_DEVLINK=m
+ default y if NET_DEVLINK=y || NET_DEVLINK=n
+ help
+ Drivers using the devlink infrastructure should have a dependency
+ on MAY_USE_DEVLINK to ensure they do not cause link errors when
+ devlink is a loadable module and the driver using it is built-in.
+
endif # if NET
# Used by archs to tell that they support BPF_JIT
diff --git a/net/Makefile b/net/Makefile
index a5d04098dfce..81d14119eab5 100644
--- a/net/Makefile
+++ b/net/Makefile
@@ -34,6 +34,7 @@ obj-$(CONFIG_IRDA) += irda/
obj-$(CONFIG_BT) += bluetooth/
obj-$(CONFIG_SUNRPC) += sunrpc/
obj-$(CONFIG_AF_RXRPC) += rxrpc/
+obj-$(CONFIG_AF_KCM) += kcm/
obj-$(CONFIG_ATM) += atm/
obj-$(CONFIG_L2TP) += l2tp/
obj-$(CONFIG_DECNET) += decnet/
diff --git a/net/appletalk/ddp.c b/net/appletalk/ddp.c
index d5871ac493eb..f066781be3c8 100644
--- a/net/appletalk/ddp.c
+++ b/net/appletalk/ddp.c
@@ -1625,7 +1625,7 @@ static int atalk_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
rt = atrtr_find(&at_hint);
}
- err = ENETUNREACH;
+ err = -ENETUNREACH;
if (!rt)
goto out;
diff --git a/net/atm/common.c b/net/atm/common.c
index 49a872db7e42..6dc12305799e 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -96,7 +96,7 @@ static void vcc_def_wakeup(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up(&wq->wait);
rcu_read_unlock();
}
@@ -117,7 +117,7 @@ static void vcc_write_space(struct sock *sk)
if (vcc_writable(sk)) {
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible(&wq->wait);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
diff --git a/net/atm/mpc.h b/net/atm/mpc.h
index 0919a88bbc70..cfc7b745aa91 100644
--- a/net/atm/mpc.h
+++ b/net/atm/mpc.h
@@ -21,11 +21,11 @@ struct mpoa_client {
uint8_t our_ctrl_addr[ATM_ESA_LEN]; /* MPC's control ATM address */
rwlock_t ingress_lock;
- struct in_cache_ops *in_ops; /* ingress cache operations */
+ const struct in_cache_ops *in_ops; /* ingress cache operations */
in_cache_entry *in_cache; /* the ingress cache of this MPC */
rwlock_t egress_lock;
- struct eg_cache_ops *eg_ops; /* egress cache operations */
+ const struct eg_cache_ops *eg_ops; /* egress cache operations */
eg_cache_entry *eg_cache; /* the egress cache of this MPC */
uint8_t *mps_macs; /* array of MPS MAC addresses, >=1 */
diff --git a/net/atm/mpoa_caches.c b/net/atm/mpoa_caches.c
index d1b2d9a03144..9e60e74c807d 100644
--- a/net/atm/mpoa_caches.c
+++ b/net/atm/mpoa_caches.c
@@ -534,7 +534,7 @@ static void eg_destroy_cache(struct mpoa_client *mpc)
}
-static struct in_cache_ops ingress_ops = {
+static const struct in_cache_ops ingress_ops = {
in_cache_add_entry, /* add_entry */
in_cache_get, /* get */
in_cache_get_with_mask, /* get_with_mask */
@@ -548,7 +548,7 @@ static struct in_cache_ops ingress_ops = {
in_destroy_cache /* destroy_cache */
};
-static struct eg_cache_ops egress_ops = {
+static const struct eg_cache_ops egress_ops = {
eg_cache_add_entry, /* add_entry */
eg_cache_get_by_cache_id, /* get_by_cache_id */
eg_cache_get_by_tag, /* get_by_tag */
diff --git a/net/ax25/af_ax25.c b/net/ax25/af_ax25.c
index ae3a47f9d1d5..fbd0acf80b13 100644
--- a/net/ax25/af_ax25.c
+++ b/net/ax25/af_ax25.c
@@ -805,6 +805,9 @@ static int ax25_create(struct net *net, struct socket *sock, int protocol,
struct sock *sk;
ax25_cb *ax25;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (!net_eq(net, &init_net))
return -EAFNOSUPPORT;
diff --git a/net/ax25/ax25_ip.c b/net/ax25/ax25_ip.c
index b563a3f5f2a8..2fa3be965101 100644
--- a/net/ax25/ax25_ip.c
+++ b/net/ax25/ax25_ip.c
@@ -228,8 +228,23 @@ netdev_tx_t ax25_ip_xmit(struct sk_buff *skb)
}
#endif
+static bool ax25_validate_header(const char *header, unsigned int len)
+{
+ ax25_digi digi;
+
+ if (!len)
+ return false;
+
+ if (header[0])
+ return true;
+
+ return ax25_addr_parse(header + 1, len - 1, NULL, NULL, &digi, NULL,
+ NULL);
+}
+
const struct header_ops ax25_header_ops = {
.create = ax25_hard_header,
+ .validate = ax25_validate_header,
};
EXPORT_SYMBOL(ax25_header_ops);
diff --git a/net/batman-adv/Kconfig b/net/batman-adv/Kconfig
index c6fc8f756c9a..f66930ee3c0b 100644
--- a/net/batman-adv/Kconfig
+++ b/net/batman-adv/Kconfig
@@ -12,9 +12,23 @@ config BATMAN_ADV
B.A.T.M.A.N. (better approach to mobile ad-hoc networking) is
a routing protocol for multi-hop ad-hoc mesh networks. The
networks may be wired or wireless. See
- http://www.open-mesh.org/ for more information and user space
+ https://www.open-mesh.org/ for more information and user space
tools.
+config BATMAN_ADV_BATMAN_V
+ bool "B.A.T.M.A.N. V protocol (experimental)"
+ depends on BATMAN_ADV && CFG80211=y || (CFG80211=m && BATMAN_ADV=m)
+ default n
+ help
+ This option enables the B.A.T.M.A.N. V protocol, the successor
+ of the currently used B.A.T.M.A.N. IV protocol. The main
+ changes include splitting of the OGM protocol into a neighbor
+ discovery protocol (Echo Location Protocol, ELP) and a new OGM
+ Protocol OGMv2 for flooding protocol information through the
+ network, as well as a throughput based metric.
+ B.A.T.M.A.N. V is currently considered experimental and not
+ compatible to B.A.T.M.A.N. IV networks.
+
config BATMAN_ADV_BLA
bool "Bridge Loop Avoidance"
depends on BATMAN_ADV && INET
diff --git a/net/batman-adv/Makefile b/net/batman-adv/Makefile
index 21434ab79d2c..797cf2fc88c1 100644
--- a/net/batman-adv/Makefile
+++ b/net/batman-adv/Makefile
@@ -1,5 +1,5 @@
#
-# Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+# Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
#
# Marek Lindner, Simon Wunderlich
#
@@ -18,6 +18,9 @@
obj-$(CONFIG_BATMAN_ADV) += batman-adv.o
batman-adv-y += bat_iv_ogm.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_elp.o
+batman-adv-$(CONFIG_BATMAN_ADV_BATMAN_V) += bat_v_ogm.o
batman-adv-y += bitarray.o
batman-adv-$(CONFIG_BATMAN_ADV_BLA) += bridge_loop_avoidance.o
batman-adv-$(CONFIG_DEBUG_FS) += debugfs.o
diff --git a/net/batman-adv/bat_algo.h b/net/batman-adv/bat_algo.h
index 4e59cf3eb079..03dafd33d23b 100644
--- a/net/batman-adv/bat_algo.h
+++ b/net/batman-adv/bat_algo.h
@@ -1,6 +1,6 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
*
- * Marek Lindner
+ * Marek Lindner, Linus Lüssing
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of version 2 of the GNU General Public
@@ -18,6 +18,32 @@
#ifndef _NET_BATMAN_ADV_BAT_ALGO_H_
#define _NET_BATMAN_ADV_BAT_ALGO_H_
+struct batadv_priv;
+
int batadv_iv_init(void);
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+
+int batadv_v_init(void);
+int batadv_v_mesh_init(struct batadv_priv *bat_priv);
+void batadv_v_mesh_free(struct batadv_priv *bat_priv);
+
+#else
+
+static inline int batadv_v_init(void)
+{
+ return 0;
+}
+
+static inline int batadv_v_mesh_init(struct batadv_priv *bat_priv)
+{
+ return 0;
+}
+
+static inline void batadv_v_mesh_free(struct batadv_priv *bat_priv)
+{
+}
+
+#endif /* CONFIG_BATMAN_ADV_BATMAN_V */
+
#endif /* _NET_BATMAN_ADV_BAT_ALGO_H_ */
diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 912d9c36fb1c..cb2d1b9b0340 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -31,6 +31,7 @@
#include <linux/init.h>
#include <linux/jiffies.h>
#include <linux/list.h>
+#include <linux/kref.h>
#include <linux/netdevice.h>
#include <linux/pkt_sched.h>
#include <linux/printk.h>
@@ -88,7 +89,7 @@ static void batadv_ring_buffer_set(u8 lq_recv[], u8 *lq_index, u8 value)
* in the given ring buffer
* @lq_recv: pointer to the ring buffer
*
- * Returns computed average value.
+ * Return: computed average value.
*/
static u8 batadv_ring_buffer_avg(const u8 lq_recv[])
{
@@ -132,7 +133,7 @@ static void batadv_iv_ogm_orig_free(struct batadv_orig_node *orig_node)
* @orig_node: the orig_node that has to be changed
* @max_if_num: the current amount of interfaces
*
- * Returns 0 on success, a negative error code otherwise.
+ * Return: 0 on success, a negative error code otherwise.
*/
static int batadv_iv_ogm_orig_add_if(struct batadv_orig_node *orig_node,
int max_if_num)
@@ -180,12 +181,13 @@ unlock:
* @max_if_num: the current amount of interfaces
* @del_if_num: the index of the interface being removed
*
- * Returns 0 on success, a negative error code otherwise.
+ * Return: 0 on success, a negative error code otherwise.
*/
static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
int max_if_num, int del_if_num)
{
- int chunk_size, ret = -ENOMEM, if_offset;
+ int ret = -ENOMEM;
+ size_t chunk_size, if_offset;
void *data_ptr = NULL;
spin_lock_bh(&orig_node->bat_iv.ogm_cnt_lock);
@@ -203,8 +205,9 @@ static int batadv_iv_ogm_orig_del_if(struct batadv_orig_node *orig_node,
memcpy(data_ptr, orig_node->bat_iv.bcast_own, del_if_num * chunk_size);
/* copy second part */
+ if_offset = (del_if_num + 1) * chunk_size;
memcpy((char *)data_ptr + del_if_num * chunk_size,
- orig_node->bat_iv.bcast_own + ((del_if_num + 1) * chunk_size),
+ (uint8_t *)orig_node->bat_iv.bcast_own + if_offset,
(max_if_num - del_if_num) * chunk_size);
free_bcast_own:
@@ -244,7 +247,7 @@ unlock:
* @bat_priv: the bat priv with all the soft interface information
* @addr: mac address of the originator
*
- * Returns the originator object corresponding to the passed mac address or NULL
+ * Return: the originator object corresponding to the passed mac address or NULL
* on failure.
* If the object does not exists it is created an initialised.
*/
@@ -284,8 +287,8 @@ batadv_iv_ogm_orig_get(struct batadv_priv *bat_priv, const u8 *addr)
free_orig_node:
/* free twice, as batadv_orig_node_new sets refcount to 2 */
- batadv_orig_node_free_ref(orig_node);
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
return NULL;
}
@@ -361,7 +364,6 @@ batadv_iv_ogm_primary_iface_set(struct batadv_hard_iface *hard_iface)
unsigned char *ogm_buff = hard_iface->bat_iv.ogm_buff;
batadv_ogm_packet = (struct batadv_ogm_packet *)ogm_buff;
- batadv_ogm_packet->flags = BATADV_PRIMARIES_FIRST_HOP;
batadv_ogm_packet->ttl = BATADV_TTL;
}
@@ -395,7 +397,14 @@ static u8 batadv_hop_penalty(u8 tq, const struct batadv_priv *bat_priv)
return new_tq;
}
-/* is there another aggregated packet here? */
+/**
+ * batadv_iv_ogm_aggr_packet - checks if there is another OGM attached
+ * @buff_pos: current position in the skb
+ * @packet_len: total length of the skb
+ * @tvlv_len: tvlv length of the previously considered OGM
+ *
+ * Return: true if there is enough space for another OGM, false otherwise.
+ */
static bool batadv_iv_ogm_aggr_packet(int buff_pos, int packet_len,
__be16 tvlv_len)
{
@@ -469,7 +478,7 @@ static void batadv_iv_ogm_send_to_if(struct batadv_forw_packet *forw_packet,
batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
skb->len + ETH_HLEN);
- batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+ batadv_send_broadcast_skb(skb, hard_iface);
}
}
@@ -506,7 +515,7 @@ static void batadv_iv_ogm_emit(struct batadv_forw_packet *forw_packet)
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
}
/**
@@ -521,7 +530,7 @@ out:
* @if_outgoing: interface for which the retransmission should be considered
* @forw_packet: the forwarded packet which should be checked
*
- * Returns true if new_packet can be aggregated with forw_packet
+ * Return: true if new_packet can be aggregated with forw_packet
*/
static bool
batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
@@ -608,7 +617,7 @@ batadv_iv_ogm_can_aggregate(const struct batadv_ogm_packet *new_bat_ogm_packet,
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return res;
}
@@ -635,10 +644,10 @@ static void batadv_iv_ogm_aggregate_new(const unsigned char *packet_buff,
unsigned char *skb_buff;
unsigned int skb_size;
- if (!atomic_inc_not_zero(&if_incoming->refcount))
+ if (!kref_get_unless_zero(&if_incoming->refcount))
return;
- if (!atomic_inc_not_zero(&if_outgoing->refcount))
+ if (!kref_get_unless_zero(&if_outgoing->refcount))
goto out_free_incoming;
/* own packet should always be scheduled */
@@ -702,9 +711,9 @@ out_nomem:
if (!own_packet)
atomic_inc(&bat_priv->batman_queue_left);
out_free_outgoing:
- batadv_hardif_free_ref(if_outgoing);
+ batadv_hardif_put(if_outgoing);
out_free_incoming:
- batadv_hardif_free_ref(if_incoming);
+ batadv_hardif_put(if_incoming);
}
/* aggregate a new packet into the existing ogm packet */
@@ -842,8 +851,6 @@ static void batadv_iv_ogm_forward(struct batadv_orig_node *orig_node,
"Forwarding packet: tq: %i, ttl: %i\n",
batadv_ogm_packet->tq, batadv_ogm_packet->ttl);
- /* switch of primaries first hop flag when forwarding */
- batadv_ogm_packet->flags &= ~BATADV_PRIMARIES_FIRST_HOP;
if (is_single_hop_neigh)
batadv_ogm_packet->flags |= BATADV_DIRECTLINK;
else
@@ -951,7 +958,7 @@ static void batadv_iv_ogm_schedule(struct batadv_hard_iface *hard_iface)
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
}
/**
@@ -996,9 +1003,9 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
neigh_addr = tmp_neigh_node->addr;
if (batadv_compare_eth(neigh_addr, ethhdr->h_source) &&
tmp_neigh_node->if_incoming == if_incoming &&
- atomic_inc_not_zero(&tmp_neigh_node->refcount)) {
+ kref_get_unless_zero(&tmp_neigh_node->refcount)) {
if (WARN(neigh_node, "too many matching neigh_nodes"))
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
neigh_node = tmp_neigh_node;
continue;
}
@@ -1019,7 +1026,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
neigh_ifinfo->bat_iv.tq_avg = tq_avg;
spin_unlock_bh(&tmp_neigh_node->ifinfo_lock);
- batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
neigh_ifinfo = NULL;
}
@@ -1034,7 +1041,7 @@ batadv_iv_ogm_orig_update(struct batadv_priv *bat_priv,
ethhdr->h_source,
orig_node, orig_tmp);
- batadv_orig_node_free_ref(orig_tmp);
+ batadv_orig_node_put(orig_tmp);
if (!neigh_node)
goto unlock;
} else {
@@ -1109,13 +1116,13 @@ unlock:
rcu_read_unlock();
out:
if (neigh_node)
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
if (router)
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
if (neigh_ifinfo)
- batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
if (router_ifinfo)
- batadv_neigh_ifinfo_free_ref(router_ifinfo);
+ batadv_neigh_ifinfo_put(router_ifinfo);
}
/**
@@ -1126,7 +1133,7 @@ out:
* @if_incoming: interface where the packet was received
* @if_outgoing: interface for which the retransmission should be considered
*
- * Returns 1 if the link can be considered bidirectional, 0 otherwise
+ * Return: 1 if the link can be considered bidirectional, 0 otherwise
*/
static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
struct batadv_orig_node *orig_neigh_node,
@@ -1155,7 +1162,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
if (tmp_neigh_node->if_incoming != if_incoming)
continue;
- if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+ if (!kref_get_unless_zero(&tmp_neigh_node->refcount))
continue;
neigh_node = tmp_neigh_node;
@@ -1185,7 +1192,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
if (neigh_ifinfo) {
neigh_rq_count = neigh_ifinfo->bat_iv.real_packet_count;
- batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
} else {
neigh_rq_count = 0;
}
@@ -1258,7 +1265,7 @@ static int batadv_iv_ogm_calc_tq(struct batadv_orig_node *orig_node,
out:
if (neigh_node)
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
return ret;
}
@@ -1270,7 +1277,7 @@ out:
* @if_incoming: interface on which the OGM packet was received
* @if_outgoing: interface for which the retransmission should be considered
*
- * Returns duplicate status as enum batadv_dup_status
+ * Return: duplicate status as enum batadv_dup_status
*/
static enum batadv_dup_status
batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
@@ -1299,7 +1306,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
if (WARN_ON(!orig_ifinfo)) {
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return 0;
}
@@ -1309,7 +1316,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
/* signalize caller that the packet is to be dropped. */
if (!hlist_empty(&orig_node->neigh_list) &&
batadv_window_protected(bat_priv, seq_diff,
- &orig_ifinfo->batman_seqno_reset)) {
+ BATADV_TQ_LOCAL_WINDOW_SIZE,
+ &orig_ifinfo->batman_seqno_reset, NULL)) {
ret = BATADV_PROTECTED;
goto out;
}
@@ -1345,7 +1353,7 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
packet_count = bitmap_weight(bitmap,
BATADV_TQ_LOCAL_WINDOW_SIZE);
neigh_ifinfo->bat_iv.real_packet_count = packet_count;
- batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
}
rcu_read_unlock();
@@ -1359,8 +1367,8 @@ batadv_iv_ogm_update_seqnos(const struct ethhdr *ethhdr,
out:
spin_unlock_bh(&orig_node->bat_iv.ogm_cnt_lock);
- batadv_orig_node_free_ref(orig_node);
- batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ batadv_orig_node_put(orig_node);
+ batadv_orig_ifinfo_put(orig_ifinfo);
return ret;
}
@@ -1379,6 +1387,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
struct batadv_hard_iface *if_outgoing)
{
struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_hardif_neigh_node *hardif_neigh = NULL;
struct batadv_neigh_node *router = NULL;
struct batadv_neigh_node *router_router = NULL;
struct batadv_orig_node *orig_neigh_node;
@@ -1423,6 +1432,13 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
goto out;
}
+ if (is_single_hop_neigh) {
+ hardif_neigh = batadv_hardif_neigh_get(if_incoming,
+ ethhdr->h_source);
+ if (hardif_neigh)
+ hardif_neigh->last_seen = jiffies;
+ }
+
router = batadv_orig_router_get(orig_node, if_outgoing);
if (router) {
router_router = batadv_orig_router_get(router->orig_node,
@@ -1498,7 +1514,7 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
ogm_packet, if_incoming,
if_outgoing, dup_status);
}
- batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ batadv_orig_ifinfo_put(orig_ifinfo);
/* only forward for specific interface, not for the default one. */
if (if_outgoing == BATADV_IF_DEFAULT)
@@ -1547,16 +1563,18 @@ batadv_iv_ogm_process_per_outif(const struct sk_buff *skb, int ogm_offset,
out_neigh:
if ((orig_neigh_node) && (!is_single_hop_neigh))
- batadv_orig_node_free_ref(orig_neigh_node);
+ batadv_orig_node_put(orig_neigh_node);
out:
if (router_ifinfo)
- batadv_neigh_ifinfo_free_ref(router_ifinfo);
+ batadv_neigh_ifinfo_put(router_ifinfo);
if (router)
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
if (router_router)
- batadv_neigh_node_free_ref(router_router);
+ batadv_neigh_node_put(router_router);
if (orig_neigh_router)
- batadv_neigh_node_free_ref(orig_neigh_router);
+ batadv_neigh_node_put(orig_neigh_router);
+ if (hardif_neigh)
+ batadv_hardif_neigh_put(hardif_neigh);
kfree_skb(skb_priv);
}
@@ -1679,7 +1697,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"Drop packet: originator packet from myself (via neighbor)\n");
- batadv_orig_node_free_ref(orig_neigh_node);
+ batadv_orig_node_put(orig_neigh_node);
return;
}
@@ -1717,7 +1735,7 @@ static void batadv_iv_ogm_process(const struct sk_buff *skb, int ogm_offset,
}
rcu_read_unlock();
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
}
static int batadv_iv_ogm_receive(struct sk_buff *skb,
@@ -1787,7 +1805,7 @@ batadv_iv_ogm_orig_print_neigh(struct batadv_orig_node *orig_node,
neigh_node->addr,
n_ifinfo->bat_iv.tq_avg);
- batadv_neigh_ifinfo_free_ref(n_ifinfo);
+ batadv_neigh_ifinfo_put(n_ifinfo);
}
}
@@ -1850,9 +1868,9 @@ static void batadv_iv_ogm_orig_print(struct batadv_priv *bat_priv,
batman_count++;
next:
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
if (n_ifinfo)
- batadv_neigh_ifinfo_free_ref(n_ifinfo);
+ batadv_neigh_ifinfo_put(n_ifinfo);
}
rcu_read_unlock();
}
@@ -1862,13 +1880,65 @@ next:
}
/**
+ * batadv_iv_hardif_neigh_print - print a single hop neighbour node
+ * @seq: neighbour table seq_file struct
+ * @hardif_neigh: hardif neighbour information
+ */
+static void
+batadv_iv_hardif_neigh_print(struct seq_file *seq,
+ struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ int last_secs, last_msecs;
+
+ last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000;
+ last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000;
+
+ seq_printf(seq, " %10s %pM %4i.%03is\n",
+ hardif_neigh->if_incoming->net_dev->name,
+ hardif_neigh->addr, last_secs, last_msecs);
+}
+
+/**
+ * batadv_iv_ogm_neigh_print - print the single hop neighbour list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq: neighbour table seq_file struct
+ */
+static void batadv_iv_neigh_print(struct batadv_priv *bat_priv,
+ struct seq_file *seq)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ struct batadv_hard_iface *hard_iface;
+ int batman_count = 0;
+
+ seq_printf(seq, " %10s %-13s %s\n",
+ "IF", "Neighbor", "last-seen");
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface != net_dev)
+ continue;
+
+ hlist_for_each_entry_rcu(hardif_neigh,
+ &hard_iface->neigh_list, list) {
+ batadv_iv_hardif_neigh_print(seq, hardif_neigh);
+ batman_count++;
+ }
+ }
+ rcu_read_unlock();
+
+ if (batman_count == 0)
+ seq_puts(seq, "No batman nodes in range ...\n");
+}
+
+/**
* batadv_iv_ogm_neigh_cmp - compare the metrics of two neighbors
* @neigh1: the first neighbor object of the comparison
* @if_outgoing1: outgoing interface for the first neighbor
* @neigh2: the second neighbor object of the comparison
* @if_outgoing2: outgoing interface for the second neighbor
*
- * Returns a value less, equal to or greater than 0 if the metric via neigh1 is
+ * Return: a value less, equal to or greater than 0 if the metric via neigh1 is
* lower, the same as or higher than the metric via neigh2
*/
static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
@@ -1894,26 +1964,26 @@ static int batadv_iv_ogm_neigh_cmp(struct batadv_neigh_node *neigh1,
out:
if (neigh1_ifinfo)
- batadv_neigh_ifinfo_free_ref(neigh1_ifinfo);
+ batadv_neigh_ifinfo_put(neigh1_ifinfo);
if (neigh2_ifinfo)
- batadv_neigh_ifinfo_free_ref(neigh2_ifinfo);
+ batadv_neigh_ifinfo_put(neigh2_ifinfo);
return diff;
}
/**
- * batadv_iv_ogm_neigh_is_eob - check if neigh1 is equally good or better than
- * neigh2 from the metric prospective
+ * batadv_iv_ogm_neigh_is_sob - check if neigh1 is similarly good or better
+ * than neigh2 from the metric prospective
* @neigh1: the first neighbor object of the comparison
* @if_outgoing1: outgoing interface for the first neighbor
* @neigh2: the second neighbor object of the comparison
* @if_outgoing2: outgoing interface for the second neighbor
*
- * Returns true if the metric via neigh1 is equally good or better than
+ * Return: true if the metric via neigh1 is equally good or better than
* the metric via neigh2, false otherwise.
*/
static bool
-batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1,
+batadv_iv_ogm_neigh_is_sob(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing1,
struct batadv_neigh_node *neigh2,
struct batadv_hard_iface *if_outgoing2)
@@ -1937,9 +2007,9 @@ batadv_iv_ogm_neigh_is_eob(struct batadv_neigh_node *neigh1,
out:
if (neigh1_ifinfo)
- batadv_neigh_ifinfo_free_ref(neigh1_ifinfo);
+ batadv_neigh_ifinfo_put(neigh1_ifinfo);
if (neigh2_ifinfo)
- batadv_neigh_ifinfo_free_ref(neigh2_ifinfo);
+ batadv_neigh_ifinfo_put(neigh2_ifinfo);
return ret;
}
@@ -1953,7 +2023,8 @@ static struct batadv_algo_ops batadv_batman_iv __read_mostly = {
.bat_ogm_schedule = batadv_iv_ogm_schedule,
.bat_ogm_emit = batadv_iv_ogm_emit,
.bat_neigh_cmp = batadv_iv_ogm_neigh_cmp,
- .bat_neigh_is_equiv_or_better = batadv_iv_ogm_neigh_is_eob,
+ .bat_neigh_is_similar_or_better = batadv_iv_ogm_neigh_is_sob,
+ .bat_neigh_print = batadv_iv_neigh_print,
.bat_orig_print = batadv_iv_ogm_orig_print,
.bat_orig_free = batadv_iv_ogm_orig_free,
.bat_orig_add_if = batadv_iv_ogm_orig_add_if,
diff --git a/net/batman-adv/bat_v.c b/net/batman-adv/bat_v.c
new file mode 100644
index 000000000000..3315b9a598af
--- /dev/null
+++ b/net/batman-adv/bat_v.c
@@ -0,0 +1,347 @@
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "bat_algo.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/bug.h>
+#include <linux/cache.h>
+#include <linux/init.h>
+#include <linux/jiffies.h>
+#include <linux/netdevice.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/seq_file.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "bat_v_elp.h"
+#include "bat_v_ogm.h"
+#include "hash.h"
+#include "originator.h"
+#include "packet.h"
+
+static int batadv_v_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+ int ret;
+
+ ret = batadv_v_elp_iface_enable(hard_iface);
+ if (ret < 0)
+ return ret;
+
+ ret = batadv_v_ogm_iface_enable(hard_iface);
+ if (ret < 0)
+ batadv_v_elp_iface_disable(hard_iface);
+
+ /* enable link throughput auto-detection by setting the throughput
+ * override to zero
+ */
+ atomic_set(&hard_iface->bat_v.throughput_override, 0);
+
+ return ret;
+}
+
+static void batadv_v_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ batadv_v_elp_iface_disable(hard_iface);
+}
+
+static void batadv_v_iface_update_mac(struct batadv_hard_iface *hard_iface)
+{
+}
+
+static void batadv_v_primary_iface_set(struct batadv_hard_iface *hard_iface)
+{
+ batadv_v_elp_primary_iface_set(hard_iface);
+ batadv_v_ogm_primary_iface_set(hard_iface);
+}
+
+static void
+batadv_v_hardif_neigh_init(struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ ewma_throughput_init(&hardif_neigh->bat_v.throughput);
+ INIT_WORK(&hardif_neigh->bat_v.metric_work,
+ batadv_v_elp_throughput_metric_update);
+}
+
+static void batadv_v_ogm_schedule(struct batadv_hard_iface *hard_iface)
+{
+}
+
+static void batadv_v_ogm_emit(struct batadv_forw_packet *forw_packet)
+{
+}
+
+/**
+ * batadv_v_orig_print_neigh - print neighbors for the originator table
+ * @orig_node: the orig_node for which the neighbors are printed
+ * @if_outgoing: outgoing interface for these entries
+ * @seq: debugfs table seq_file struct
+ *
+ * Must be called while holding an rcu lock.
+ */
+static void
+batadv_v_orig_print_neigh(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *if_outgoing,
+ struct seq_file *seq)
+{
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_neigh_ifinfo *n_ifinfo;
+
+ hlist_for_each_entry_rcu(neigh_node, &orig_node->neigh_list, list) {
+ n_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!n_ifinfo)
+ continue;
+
+ seq_printf(seq, " %pM (%9u.%1u)",
+ neigh_node->addr,
+ n_ifinfo->bat_v.throughput / 10,
+ n_ifinfo->bat_v.throughput % 10);
+
+ batadv_neigh_ifinfo_put(n_ifinfo);
+ }
+}
+
+/**
+ * batadv_v_hardif_neigh_print - print a single ELP neighbour node
+ * @seq: neighbour table seq_file struct
+ * @hardif_neigh: hardif neighbour information
+ */
+static void
+batadv_v_hardif_neigh_print(struct seq_file *seq,
+ struct batadv_hardif_neigh_node *hardif_neigh)
+{
+ int last_secs, last_msecs;
+ u32 throughput;
+
+ last_secs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) / 1000;
+ last_msecs = jiffies_to_msecs(jiffies - hardif_neigh->last_seen) % 1000;
+ throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
+
+ seq_printf(seq, "%pM %4i.%03is (%9u.%1u) [%10s]\n",
+ hardif_neigh->addr, last_secs, last_msecs, throughput / 10,
+ throughput % 10, hardif_neigh->if_incoming->net_dev->name);
+}
+
+/**
+ * batadv_v_neigh_print - print the single hop neighbour list
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq: neighbour table seq_file struct
+ */
+static void batadv_v_neigh_print(struct batadv_priv *bat_priv,
+ struct seq_file *seq)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ struct batadv_hard_iface *hard_iface;
+ int batman_count = 0;
+
+ seq_printf(seq, " %-15s %s (%11s) [%10s]\n", "Neighbor",
+ "last-seen", "throughput", "IF");
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface != net_dev)
+ continue;
+
+ hlist_for_each_entry_rcu(hardif_neigh,
+ &hard_iface->neigh_list, list) {
+ batadv_v_hardif_neigh_print(seq, hardif_neigh);
+ batman_count++;
+ }
+ }
+ rcu_read_unlock();
+
+ if (batman_count == 0)
+ seq_puts(seq, "No batman nodes in range ...\n");
+}
+
+/**
+ * batadv_v_orig_print - print the originator table
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq: debugfs table seq_file struct
+ * @if_outgoing: the outgoing interface for which this should be printed
+ */
+static void batadv_v_orig_print(struct batadv_priv *bat_priv,
+ struct seq_file *seq,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *neigh_node;
+ struct batadv_hashtable *hash = bat_priv->orig_hash;
+ int last_seen_msecs, last_seen_secs;
+ struct batadv_orig_node *orig_node;
+ struct batadv_neigh_ifinfo *n_ifinfo;
+ unsigned long last_seen_jiffies;
+ struct hlist_head *head;
+ int batman_count = 0;
+ u32 i;
+
+ seq_printf(seq, " %-15s %s (%11s) %17s [%10s]: %20s ...\n",
+ "Originator", "last-seen", "throughput", "Nexthop",
+ "outgoingIF", "Potential nexthops");
+
+ for (i = 0; i < hash->size; i++) {
+ head = &hash->table[i];
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(orig_node, head, hash_entry) {
+ neigh_node = batadv_orig_router_get(orig_node,
+ if_outgoing);
+ if (!neigh_node)
+ continue;
+
+ n_ifinfo = batadv_neigh_ifinfo_get(neigh_node,
+ if_outgoing);
+ if (!n_ifinfo)
+ goto next;
+
+ last_seen_jiffies = jiffies - orig_node->last_seen;
+ last_seen_msecs = jiffies_to_msecs(last_seen_jiffies);
+ last_seen_secs = last_seen_msecs / 1000;
+ last_seen_msecs = last_seen_msecs % 1000;
+
+ seq_printf(seq, "%pM %4i.%03is (%9u.%1u) %pM [%10s]:",
+ orig_node->orig, last_seen_secs,
+ last_seen_msecs,
+ n_ifinfo->bat_v.throughput / 10,
+ n_ifinfo->bat_v.throughput % 10,
+ neigh_node->addr,
+ neigh_node->if_incoming->net_dev->name);
+
+ batadv_v_orig_print_neigh(orig_node, if_outgoing, seq);
+ seq_puts(seq, "\n");
+ batman_count++;
+
+next:
+ batadv_neigh_node_put(neigh_node);
+ if (n_ifinfo)
+ batadv_neigh_ifinfo_put(n_ifinfo);
+ }
+ rcu_read_unlock();
+ }
+
+ if (batman_count == 0)
+ seq_puts(seq, "No batman nodes in range ...\n");
+}
+
+static int batadv_v_neigh_cmp(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+
+ ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+
+ if (WARN_ON(!ifinfo1 || !ifinfo2))
+ return 0;
+
+ return ifinfo1->bat_v.throughput - ifinfo2->bat_v.throughput;
+}
+
+static bool batadv_v_neigh_is_sob(struct batadv_neigh_node *neigh1,
+ struct batadv_hard_iface *if_outgoing1,
+ struct batadv_neigh_node *neigh2,
+ struct batadv_hard_iface *if_outgoing2)
+{
+ struct batadv_neigh_ifinfo *ifinfo1, *ifinfo2;
+ u32 threshold;
+
+ ifinfo1 = batadv_neigh_ifinfo_get(neigh1, if_outgoing1);
+ ifinfo2 = batadv_neigh_ifinfo_get(neigh2, if_outgoing2);
+
+ threshold = ifinfo1->bat_v.throughput / 4;
+ threshold = ifinfo1->bat_v.throughput - threshold;
+
+ return ifinfo2->bat_v.throughput > threshold;
+}
+
+static struct batadv_algo_ops batadv_batman_v __read_mostly = {
+ .name = "BATMAN_V",
+ .bat_iface_enable = batadv_v_iface_enable,
+ .bat_iface_disable = batadv_v_iface_disable,
+ .bat_iface_update_mac = batadv_v_iface_update_mac,
+ .bat_primary_iface_set = batadv_v_primary_iface_set,
+ .bat_hardif_neigh_init = batadv_v_hardif_neigh_init,
+ .bat_ogm_emit = batadv_v_ogm_emit,
+ .bat_ogm_schedule = batadv_v_ogm_schedule,
+ .bat_orig_print = batadv_v_orig_print,
+ .bat_neigh_cmp = batadv_v_neigh_cmp,
+ .bat_neigh_is_similar_or_better = batadv_v_neigh_is_sob,
+ .bat_neigh_print = batadv_v_neigh_print,
+};
+
+/**
+ * batadv_v_mesh_init - initialize the B.A.T.M.A.N. V private resources for a
+ * mesh
+ * @bat_priv: the object representing the mesh interface to initialise
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int batadv_v_mesh_init(struct batadv_priv *bat_priv)
+{
+ return batadv_v_ogm_init(bat_priv);
+}
+
+/**
+ * batadv_v_mesh_free - free the B.A.T.M.A.N. V private resources for a mesh
+ * @bat_priv: the object representing the mesh interface to free
+ */
+void batadv_v_mesh_free(struct batadv_priv *bat_priv)
+{
+ batadv_v_ogm_free(bat_priv);
+}
+
+/**
+ * batadv_v_init - B.A.T.M.A.N. V initialization function
+ *
+ * Description: Takes care of initializing all the subcomponents.
+ * It is invoked upon module load only.
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int __init batadv_v_init(void)
+{
+ int ret;
+
+ /* B.A.T.M.A.N. V echo location protocol packet */
+ ret = batadv_recv_handler_register(BATADV_ELP,
+ batadv_v_elp_packet_recv);
+ if (ret < 0)
+ return ret;
+
+ ret = batadv_recv_handler_register(BATADV_OGM2,
+ batadv_v_ogm_packet_recv);
+ if (ret < 0)
+ goto elp_unregister;
+
+ ret = batadv_algo_register(&batadv_batman_v);
+ if (ret < 0)
+ goto ogm_unregister;
+
+ return ret;
+
+ogm_unregister:
+ batadv_recv_handler_unregister(BATADV_OGM2);
+
+elp_unregister:
+ batadv_recv_handler_unregister(BATADV_ELP);
+
+ return ret;
+}
diff --git a/net/batman-adv/bat_v_elp.c b/net/batman-adv/bat_v_elp.c
new file mode 100644
index 000000000000..3844e7efd0b0
--- /dev/null
+++ b/net/batman-adv/bat_v_elp.c
@@ -0,0 +1,515 @@
+/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "bat_v_elp.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/ethtool.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/kref.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/rtnetlink.h>
+#include <linux/skbuff.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+#include <net/cfg80211.h>
+
+#include "bat_algo.h"
+#include "bat_v_ogm.h"
+#include "hard-interface.h"
+#include "originator.h"
+#include "packet.h"
+#include "routing.h"
+#include "send.h"
+
+/**
+ * batadv_v_elp_start_timer - restart timer for ELP periodic work
+ * @hard_iface: the interface for which the timer has to be reset
+ */
+static void batadv_v_elp_start_timer(struct batadv_hard_iface *hard_iface)
+{
+ unsigned int msecs;
+
+ msecs = atomic_read(&hard_iface->bat_v.elp_interval) - BATADV_JITTER;
+ msecs += prandom_u32() % (2 * BATADV_JITTER);
+
+ queue_delayed_work(batadv_event_workqueue, &hard_iface->bat_v.elp_wq,
+ msecs_to_jiffies(msecs));
+}
+
+/**
+ * batadv_v_elp_get_throughput - get the throughput towards a neighbour
+ * @neigh: the neighbour for which the throughput has to be obtained
+ *
+ * Return: The throughput towards the given neighbour in multiples of 100kpbs
+ * (a value of '1' equals to 0.1Mbps, '10' equals 1Mbps, etc).
+ */
+static u32 batadv_v_elp_get_throughput(struct batadv_hardif_neigh_node *neigh)
+{
+ struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+ struct ethtool_link_ksettings link_settings;
+ struct station_info sinfo;
+ u32 throughput;
+ int ret;
+
+ /* if the user specified a customised value for this interface, then
+ * return it directly
+ */
+ throughput = atomic_read(&hard_iface->bat_v.throughput_override);
+ if (throughput != 0)
+ return throughput;
+
+ /* if this is a wireless device, then ask its throughput through
+ * cfg80211 API
+ */
+ if (batadv_is_wifi_netdev(hard_iface->net_dev)) {
+ if (hard_iface->net_dev->ieee80211_ptr) {
+ ret = cfg80211_get_station(hard_iface->net_dev,
+ neigh->addr, &sinfo);
+ if (ret == -ENOENT) {
+ /* Node is not associated anymore! It would be
+ * possible to delete this neighbor. For now set
+ * the throughput metric to 0.
+ */
+ return 0;
+ }
+ if (!ret)
+ return sinfo.expected_throughput / 100;
+ }
+
+ /* unsupported WiFi driver version */
+ goto default_throughput;
+ }
+
+ /* if not a wifi interface, check if this device provides data via
+ * ethtool (e.g. an Ethernet adapter)
+ */
+ memset(&link_settings, 0, sizeof(link_settings));
+ rtnl_lock();
+ ret = __ethtool_get_link_ksettings(hard_iface->net_dev, &link_settings);
+ rtnl_unlock();
+ if (ret == 0) {
+ /* link characteristics might change over time */
+ if (link_settings.base.duplex == DUPLEX_FULL)
+ hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
+ else
+ hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
+
+ throughput = link_settings.base.speed;
+ if (throughput && (throughput != SPEED_UNKNOWN))
+ return throughput * 10;
+ }
+
+default_throughput:
+ if (!(hard_iface->bat_v.flags & BATADV_WARNING_DEFAULT)) {
+ batadv_info(hard_iface->soft_iface,
+ "WiFi driver or ethtool info does not provide information about link speeds on interface %s, therefore defaulting to hardcoded throughput values of %u.%1u Mbps. Consider overriding the throughput manually or checking your driver.\n",
+ hard_iface->net_dev->name,
+ BATADV_THROUGHPUT_DEFAULT_VALUE / 10,
+ BATADV_THROUGHPUT_DEFAULT_VALUE % 10);
+ hard_iface->bat_v.flags |= BATADV_WARNING_DEFAULT;
+ }
+
+ /* if none of the above cases apply, return the base_throughput */
+ return BATADV_THROUGHPUT_DEFAULT_VALUE;
+}
+
+/**
+ * batadv_v_elp_throughput_metric_update - worker updating the throughput metric
+ * of a single hop neighbour
+ * @work: the work queue item
+ */
+void batadv_v_elp_throughput_metric_update(struct work_struct *work)
+{
+ struct batadv_hardif_neigh_node_bat_v *neigh_bat_v;
+ struct batadv_hardif_neigh_node *neigh;
+
+ neigh_bat_v = container_of(work, struct batadv_hardif_neigh_node_bat_v,
+ metric_work);
+ neigh = container_of(neigh_bat_v, struct batadv_hardif_neigh_node,
+ bat_v);
+
+ ewma_throughput_add(&neigh->bat_v.throughput,
+ batadv_v_elp_get_throughput(neigh));
+
+ /* decrement refcounter to balance increment performed before scheduling
+ * this task
+ */
+ batadv_hardif_neigh_put(neigh);
+}
+
+/**
+ * batadv_v_elp_wifi_neigh_probe - send link probing packets to a neighbour
+ * @neigh: the neighbour to probe
+ *
+ * Sends a predefined number of unicast wifi packets to a given neighbour in
+ * order to trigger the throughput estimation on this link by the RC algorithm.
+ * Packets are sent only if there there is not enough payload unicast traffic
+ * towards this neighbour..
+ *
+ * Return: True on success and false in case of error during skb preparation.
+ */
+static bool
+batadv_v_elp_wifi_neigh_probe(struct batadv_hardif_neigh_node *neigh)
+{
+ struct batadv_hard_iface *hard_iface = neigh->if_incoming;
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ unsigned long last_tx_diff;
+ struct sk_buff *skb;
+ int probe_len, i;
+ int elp_skb_len;
+
+ /* this probing routine is for Wifi neighbours only */
+ if (!batadv_is_wifi_netdev(hard_iface->net_dev))
+ return true;
+
+ /* probe the neighbor only if no unicast packets have been sent
+ * to it in the last 100 milliseconds: this is the rate control
+ * algorithm sampling interval (minstrel). In this way, if not
+ * enough traffic has been sent to the neighbor, batman-adv can
+ * generate 2 probe packets and push the RC algorithm to perform
+ * the sampling
+ */
+ last_tx_diff = jiffies_to_msecs(jiffies - neigh->bat_v.last_unicast_tx);
+ if (last_tx_diff <= BATADV_ELP_PROBE_MAX_TX_DIFF)
+ return true;
+
+ probe_len = max_t(int, sizeof(struct batadv_elp_packet),
+ BATADV_ELP_MIN_PROBE_SIZE);
+
+ for (i = 0; i < BATADV_ELP_PROBES_PER_NODE; i++) {
+ elp_skb_len = hard_iface->bat_v.elp_skb->len;
+ skb = skb_copy_expand(hard_iface->bat_v.elp_skb, 0,
+ probe_len - elp_skb_len,
+ GFP_ATOMIC);
+ if (!skb)
+ return false;
+
+ /* Tell the skb to get as big as the allocated space (we want
+ * the packet to be exactly of that size to make the link
+ * throughput estimation effective.
+ */
+ skb_put(skb, probe_len - hard_iface->bat_v.elp_skb->len);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Sending unicast (probe) ELP packet on interface %s to %pM\n",
+ hard_iface->net_dev->name, neigh->addr);
+
+ batadv_send_skb_packet(skb, hard_iface, neigh->addr);
+ }
+
+ return true;
+}
+
+/**
+ * batadv_v_elp_periodic_work - ELP periodic task per interface
+ * @work: work queue item
+ *
+ * Emits broadcast ELP message in regular intervals.
+ */
+static void batadv_v_elp_periodic_work(struct work_struct *work)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_hard_iface_bat_v *bat_v;
+ struct batadv_elp_packet *elp_packet;
+ struct batadv_priv *bat_priv;
+ struct sk_buff *skb;
+ u32 elp_interval;
+
+ bat_v = container_of(work, struct batadv_hard_iface_bat_v, elp_wq.work);
+ hard_iface = container_of(bat_v, struct batadv_hard_iface, bat_v);
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ goto out;
+
+ /* we are in the process of shutting this interface down */
+ if ((hard_iface->if_status == BATADV_IF_NOT_IN_USE) ||
+ (hard_iface->if_status == BATADV_IF_TO_BE_REMOVED))
+ goto out;
+
+ /* the interface was enabled but may not be ready yet */
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ goto restart_timer;
+
+ skb = skb_copy(hard_iface->bat_v.elp_skb, GFP_ATOMIC);
+ if (!skb)
+ goto restart_timer;
+
+ elp_packet = (struct batadv_elp_packet *)skb->data;
+ elp_packet->seqno = htonl(atomic_read(&hard_iface->bat_v.elp_seqno));
+ elp_interval = atomic_read(&hard_iface->bat_v.elp_interval);
+ elp_packet->elp_interval = htonl(elp_interval);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Sending broadcast ELP packet on interface %s, seqno %u\n",
+ hard_iface->net_dev->name,
+ atomic_read(&hard_iface->bat_v.elp_seqno));
+
+ batadv_send_broadcast_skb(skb, hard_iface);
+
+ atomic_inc(&hard_iface->bat_v.elp_seqno);
+
+ /* The throughput metric is updated on each sent packet. This way, if a
+ * node is dead and no longer sends packets, batman-adv is still able to
+ * react timely to its death.
+ *
+ * The throughput metric is updated by following these steps:
+ * 1) if the hard_iface is wifi => send a number of unicast ELPs for
+ * probing/sampling to each neighbor
+ * 2) update the throughput metric value of each neighbor (note that the
+ * value retrieved in this step might be 100ms old because the
+ * probing packets at point 1) could still be in the HW queue)
+ */
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(hardif_neigh, &hard_iface->neigh_list, list) {
+ if (!batadv_v_elp_wifi_neigh_probe(hardif_neigh))
+ /* if something goes wrong while probing, better to stop
+ * sending packets immediately and reschedule the task
+ */
+ break;
+
+ if (!kref_get_unless_zero(&hardif_neigh->refcount))
+ continue;
+
+ /* Reading the estimated throughput from cfg80211 is a task that
+ * may sleep and that is not allowed in an rcu protected
+ * context. Therefore schedule a task for that.
+ */
+ queue_work(batadv_event_workqueue,
+ &hardif_neigh->bat_v.metric_work);
+ }
+ rcu_read_unlock();
+
+restart_timer:
+ batadv_v_elp_start_timer(hard_iface);
+out:
+ return;
+}
+
+/**
+ * batadv_v_elp_iface_enable - setup the ELP interface private resources
+ * @hard_iface: interface for which the data has to be prepared
+ *
+ * Return: 0 on success or a -ENOMEM in case of failure.
+ */
+int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_elp_packet *elp_packet;
+ unsigned char *elp_buff;
+ u32 random_seqno;
+ size_t size;
+ int res = -ENOMEM;
+
+ size = ETH_HLEN + NET_IP_ALIGN + BATADV_ELP_HLEN;
+ hard_iface->bat_v.elp_skb = dev_alloc_skb(size);
+ if (!hard_iface->bat_v.elp_skb)
+ goto out;
+
+ skb_reserve(hard_iface->bat_v.elp_skb, ETH_HLEN + NET_IP_ALIGN);
+ elp_buff = skb_push(hard_iface->bat_v.elp_skb, BATADV_ELP_HLEN);
+ elp_packet = (struct batadv_elp_packet *)elp_buff;
+ memset(elp_packet, 0, BATADV_ELP_HLEN);
+
+ elp_packet->packet_type = BATADV_ELP;
+ elp_packet->version = BATADV_COMPAT_VERSION;
+
+ /* randomize initial seqno to avoid collision */
+ get_random_bytes(&random_seqno, sizeof(random_seqno));
+ atomic_set(&hard_iface->bat_v.elp_seqno, random_seqno);
+ atomic_set(&hard_iface->bat_v.elp_interval, 500);
+
+ /* assume full-duplex by default */
+ hard_iface->bat_v.flags |= BATADV_FULL_DUPLEX;
+
+ /* warn the user (again) if there is no throughput data is available */
+ hard_iface->bat_v.flags &= ~BATADV_WARNING_DEFAULT;
+
+ if (batadv_is_wifi_netdev(hard_iface->net_dev))
+ hard_iface->bat_v.flags &= ~BATADV_FULL_DUPLEX;
+
+ INIT_DELAYED_WORK(&hard_iface->bat_v.elp_wq,
+ batadv_v_elp_periodic_work);
+ batadv_v_elp_start_timer(hard_iface);
+ res = 0;
+
+out:
+ return res;
+}
+
+/**
+ * batadv_v_elp_iface_disable - release ELP interface private resources
+ * @hard_iface: interface for which the resources have to be released
+ */
+void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface)
+{
+ cancel_delayed_work_sync(&hard_iface->bat_v.elp_wq);
+
+ dev_kfree_skb(hard_iface->bat_v.elp_skb);
+ hard_iface->bat_v.elp_skb = NULL;
+}
+
+/**
+ * batadv_v_elp_primary_iface_set - change internal data to reflect the new
+ * primary interface
+ * @primary_iface: the new primary interface
+ */
+void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_elp_packet *elp_packet;
+ struct sk_buff *skb;
+
+ /* update orig field of every elp iface belonging to this mesh */
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (primary_iface->soft_iface != hard_iface->soft_iface)
+ continue;
+
+ if (!hard_iface->bat_v.elp_skb)
+ continue;
+
+ skb = hard_iface->bat_v.elp_skb;
+ elp_packet = (struct batadv_elp_packet *)skb->data;
+ ether_addr_copy(elp_packet->orig,
+ primary_iface->net_dev->dev_addr);
+ }
+ rcu_read_unlock();
+}
+
+/**
+ * batadv_v_elp_neigh_update - update an ELP neighbour node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @neigh_addr: the neighbour interface address
+ * @if_incoming: the interface the packet was received through
+ * @elp_packet: the received ELP packet
+ *
+ * Updates the ELP neighbour node state with the data received within the new
+ * ELP packet.
+ */
+static void batadv_v_elp_neigh_update(struct batadv_priv *bat_priv,
+ u8 *neigh_addr,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_elp_packet *elp_packet)
+
+{
+ struct batadv_neigh_node *neigh;
+ struct batadv_orig_node *orig_neigh;
+ struct batadv_hardif_neigh_node *hardif_neigh;
+ s32 seqno_diff;
+ s32 elp_latest_seqno;
+
+ orig_neigh = batadv_v_ogm_orig_get(bat_priv, elp_packet->orig);
+ if (!orig_neigh)
+ return;
+
+ neigh = batadv_neigh_node_new(orig_neigh, if_incoming, neigh_addr);
+ if (!neigh)
+ goto orig_free;
+
+ hardif_neigh = batadv_hardif_neigh_get(if_incoming, neigh_addr);
+ if (!hardif_neigh)
+ goto neigh_free;
+
+ elp_latest_seqno = hardif_neigh->bat_v.elp_latest_seqno;
+ seqno_diff = ntohl(elp_packet->seqno) - elp_latest_seqno;
+
+ /* known or older sequence numbers are ignored. However always adopt
+ * if the router seems to have been restarted.
+ */
+ if (seqno_diff < 1 && seqno_diff > -BATADV_ELP_MAX_AGE)
+ goto hardif_free;
+
+ neigh->last_seen = jiffies;
+ hardif_neigh->last_seen = jiffies;
+ hardif_neigh->bat_v.elp_latest_seqno = ntohl(elp_packet->seqno);
+ hardif_neigh->bat_v.elp_interval = ntohl(elp_packet->elp_interval);
+
+hardif_free:
+ if (hardif_neigh)
+ batadv_hardif_neigh_put(hardif_neigh);
+neigh_free:
+ if (neigh)
+ batadv_neigh_node_put(neigh);
+orig_free:
+ if (orig_neigh)
+ batadv_orig_node_put(orig_neigh);
+}
+
+/**
+ * batadv_v_elp_packet_recv - main ELP packet handler
+ * @skb: the received packet
+ * @if_incoming: the interface this packet was received through
+ *
+ * Return: NET_RX_SUCCESS and consumes the skb if the packet was peoperly
+ * processed or NET_RX_DROP in case of failure.
+ */
+int batadv_v_elp_packet_recv(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_elp_packet *elp_packet;
+ struct batadv_hard_iface *primary_if;
+ struct ethhdr *ethhdr = (struct ethhdr *)skb_mac_header(skb);
+ bool ret;
+
+ ret = batadv_check_management_packet(skb, if_incoming, BATADV_ELP_HLEN);
+ if (!ret)
+ return NET_RX_DROP;
+
+ if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
+ return NET_RX_DROP;
+
+ /* did we receive a B.A.T.M.A.N. V ELP packet on an interface
+ * that does not have B.A.T.M.A.N. V ELP enabled ?
+ */
+ if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0)
+ return NET_RX_DROP;
+
+ elp_packet = (struct batadv_elp_packet *)skb->data;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Received ELP packet from %pM seqno %u ORIG: %pM\n",
+ ethhdr->h_source, ntohl(elp_packet->seqno),
+ elp_packet->orig);
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ goto out;
+
+ batadv_v_elp_neigh_update(bat_priv, ethhdr->h_source, if_incoming,
+ elp_packet);
+
+out:
+ if (primary_if)
+ batadv_hardif_put(primary_if);
+ consume_skb(skb);
+ return NET_RX_SUCCESS;
+}
diff --git a/net/batman-adv/bat_v_elp.h b/net/batman-adv/bat_v_elp.h
new file mode 100644
index 000000000000..e95f1bca0785
--- /dev/null
+++ b/net/batman-adv/bat_v_elp.h
@@ -0,0 +1,33 @@
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+ *
+ * Linus Lüssing, Marek Lindner
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "main.h"
+
+#ifndef _NET_BATMAN_ADV_BAT_V_ELP_H_
+#define _NET_BATMAN_ADV_BAT_V_ELP_H_
+
+struct sk_buff;
+struct work_struct;
+
+int batadv_v_elp_iface_enable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_iface_disable(struct batadv_hard_iface *hard_iface);
+void batadv_v_elp_primary_iface_set(struct batadv_hard_iface *primary_iface);
+int batadv_v_elp_packet_recv(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming);
+void batadv_v_elp_throughput_metric_update(struct work_struct *work);
+
+#endif /* _NET_BATMAN_ADV_BAT_V_ELP_H_ */
diff --git a/net/batman-adv/bat_v_ogm.c b/net/batman-adv/bat_v_ogm.c
new file mode 100644
index 000000000000..d9bcbe6e7d65
--- /dev/null
+++ b/net/batman-adv/bat_v_ogm.c
@@ -0,0 +1,833 @@
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#include "bat_v_ogm.h"
+#include "main.h"
+
+#include <linux/atomic.h>
+#include <linux/byteorder/generic.h>
+#include <linux/errno.h>
+#include <linux/etherdevice.h>
+#include <linux/fs.h>
+#include <linux/if_ether.h>
+#include <linux/jiffies.h>
+#include <linux/kernel.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <linux/random.h>
+#include <linux/rculist.h>
+#include <linux/rcupdate.h>
+#include <linux/skbuff.h>
+#include <linux/slab.h>
+#include <linux/stddef.h>
+#include <linux/string.h>
+#include <linux/types.h>
+#include <linux/workqueue.h>
+
+#include "hard-interface.h"
+#include "hash.h"
+#include "originator.h"
+#include "packet.h"
+#include "routing.h"
+#include "send.h"
+#include "translation-table.h"
+
+/**
+ * batadv_v_ogm_orig_get - retrieve and possibly create an originator node
+ * @bat_priv: the bat priv with all the soft interface information
+ * @addr: the address of the originator
+ *
+ * Return: the orig_node corresponding to the specified address. If such object
+ * does not exist it is allocated here. In case of allocation failure returns
+ * NULL.
+ */
+struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
+ const u8 *addr)
+{
+ struct batadv_orig_node *orig_node;
+ int hash_added;
+
+ orig_node = batadv_orig_hash_find(bat_priv, addr);
+ if (orig_node)
+ return orig_node;
+
+ orig_node = batadv_orig_node_new(bat_priv, addr);
+ if (!orig_node)
+ return NULL;
+
+ hash_added = batadv_hash_add(bat_priv->orig_hash, batadv_compare_orig,
+ batadv_choose_orig, orig_node,
+ &orig_node->hash_entry);
+ if (hash_added != 0) {
+ /* orig_node->refcounter is initialised to 2 by
+ * batadv_orig_node_new()
+ */
+ batadv_orig_node_put(orig_node);
+ batadv_orig_node_put(orig_node);
+ orig_node = NULL;
+ }
+
+ return orig_node;
+}
+
+/**
+ * batadv_v_ogm_start_timer - restart the OGM sending timer
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+static void batadv_v_ogm_start_timer(struct batadv_priv *bat_priv)
+{
+ unsigned long msecs;
+ /* this function may be invoked in different contexts (ogm rescheduling
+ * or hard_iface activation), but the work timer should not be reset
+ */
+ if (delayed_work_pending(&bat_priv->bat_v.ogm_wq))
+ return;
+
+ msecs = atomic_read(&bat_priv->orig_interval) - BATADV_JITTER;
+ msecs += prandom_u32() % (2 * BATADV_JITTER);
+ queue_delayed_work(batadv_event_workqueue, &bat_priv->bat_v.ogm_wq,
+ msecs_to_jiffies(msecs));
+}
+
+/**
+ * batadv_v_ogm_send_to_if - send a batman ogm using a given interface
+ * @skb: the OGM to send
+ * @hard_iface: the interface to use to send the OGM
+ */
+static void batadv_v_ogm_send_to_if(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ return;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_TX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MGMT_TX_BYTES,
+ skb->len + ETH_HLEN);
+
+ batadv_send_broadcast_skb(skb, hard_iface);
+}
+
+/**
+ * batadv_v_ogm_send - periodic worker broadcasting the own OGM
+ * @work: work queue item
+ */
+static void batadv_v_ogm_send(struct work_struct *work)
+{
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_priv_bat_v *bat_v;
+ struct batadv_priv *bat_priv;
+ struct batadv_ogm2_packet *ogm_packet;
+ struct sk_buff *skb, *skb_tmp;
+ unsigned char *ogm_buff, *pkt_buff;
+ int ogm_buff_len;
+ u16 tvlv_len = 0;
+
+ bat_v = container_of(work, struct batadv_priv_bat_v, ogm_wq.work);
+ bat_priv = container_of(bat_v, struct batadv_priv, bat_v);
+
+ if (atomic_read(&bat_priv->mesh_state) == BATADV_MESH_DEACTIVATING)
+ goto out;
+
+ ogm_buff = bat_priv->bat_v.ogm_buff;
+ ogm_buff_len = bat_priv->bat_v.ogm_buff_len;
+ /* tt changes have to be committed before the tvlv data is
+ * appended as it may alter the tt tvlv container
+ */
+ batadv_tt_local_commit_changes(bat_priv);
+ tvlv_len = batadv_tvlv_container_ogm_append(bat_priv, &ogm_buff,
+ &ogm_buff_len,
+ BATADV_OGM2_HLEN);
+
+ bat_priv->bat_v.ogm_buff = ogm_buff;
+ bat_priv->bat_v.ogm_buff_len = ogm_buff_len;
+
+ skb = netdev_alloc_skb_ip_align(NULL, ETH_HLEN + ogm_buff_len);
+ if (!skb)
+ goto reschedule;
+
+ skb_reserve(skb, ETH_HLEN);
+ pkt_buff = skb_put(skb, ogm_buff_len);
+ memcpy(pkt_buff, ogm_buff, ogm_buff_len);
+
+ ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+ ogm_packet->seqno = htonl(atomic_read(&bat_priv->bat_v.ogm_seqno));
+ atomic_inc(&bat_priv->bat_v.ogm_seqno);
+ ogm_packet->tvlv_len = htons(tvlv_len);
+
+ /* broadcast on every interface */
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->soft_iface != bat_priv->soft_iface)
+ continue;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Sending own OGM2 packet (originator %pM, seqno %u, throughput %u, TTL %d) on interface %s [%pM]\n",
+ ogm_packet->orig, ntohl(ogm_packet->seqno),
+ ntohl(ogm_packet->throughput), ogm_packet->ttl,
+ hard_iface->net_dev->name,
+ hard_iface->net_dev->dev_addr);
+
+ /* this skb gets consumed by batadv_v_ogm_send_to_if() */
+ skb_tmp = skb_clone(skb, GFP_ATOMIC);
+ if (!skb_tmp)
+ break;
+
+ batadv_v_ogm_send_to_if(skb_tmp, hard_iface);
+ }
+ rcu_read_unlock();
+
+ consume_skb(skb);
+
+reschedule:
+ batadv_v_ogm_start_timer(bat_priv);
+out:
+ return;
+}
+
+/**
+ * batadv_v_ogm_iface_enable - prepare an interface for B.A.T.M.A.N. V
+ * @hard_iface: the interface to prepare
+ *
+ * Takes care of scheduling own OGM sending routine for this interface.
+ *
+ * Return: 0 on success or a negative error code otherwise
+ */
+int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+
+ batadv_v_ogm_start_timer(bat_priv);
+
+ return 0;
+}
+
+/**
+ * batadv_v_ogm_primary_iface_set - set a new primary interface
+ * @primary_iface: the new primary interface
+ */
+void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface)
+{
+ struct batadv_priv *bat_priv = netdev_priv(primary_iface->soft_iface);
+ struct batadv_ogm2_packet *ogm_packet;
+
+ if (!bat_priv->bat_v.ogm_buff)
+ return;
+
+ ogm_packet = (struct batadv_ogm2_packet *)bat_priv->bat_v.ogm_buff;
+ ether_addr_copy(ogm_packet->orig, primary_iface->net_dev->dev_addr);
+}
+
+/**
+ * batadv_v_ogm_orig_update - update the originator status based on the received
+ * OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ * @orig_node: the originator to update
+ * @neigh_node: the neighbour the OGM has been received from (to update)
+ * @ogm2: the received OGM
+ * @if_outgoing: the interface where this OGM is going to be forwarded through
+ */
+static void
+batadv_v_ogm_orig_update(struct batadv_priv *bat_priv,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ const struct batadv_ogm2_packet *ogm2,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_ifinfo *router_ifinfo = NULL, *neigh_ifinfo = NULL;
+ struct batadv_neigh_node *router = NULL;
+ s32 neigh_seq_diff;
+ u32 neigh_last_seqno;
+ u32 router_last_seqno;
+ u32 router_throughput, neigh_throughput;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Searching and updating originator entry of received packet\n");
+
+ /* if this neighbor already is our next hop there is nothing
+ * to change
+ */
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+ if (router == neigh_node)
+ goto out;
+
+ /* don't consider neighbours with worse throughput.
+ * also switch route if this seqno is BATADV_V_MAX_ORIGDIFF newer than
+ * the last received seqno from our best next hop.
+ */
+ if (router) {
+ router_ifinfo = batadv_neigh_ifinfo_get(router, if_outgoing);
+ neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+
+ /* if these are not allocated, something is wrong. */
+ if (!router_ifinfo || !neigh_ifinfo)
+ goto out;
+
+ neigh_last_seqno = neigh_ifinfo->bat_v.last_seqno;
+ router_last_seqno = router_ifinfo->bat_v.last_seqno;
+ neigh_seq_diff = neigh_last_seqno - router_last_seqno;
+ router_throughput = router_ifinfo->bat_v.throughput;
+ neigh_throughput = neigh_ifinfo->bat_v.throughput;
+
+ if ((neigh_seq_diff < BATADV_OGM_MAX_ORIGDIFF) &&
+ (router_throughput >= neigh_throughput))
+ goto out;
+ }
+
+ batadv_update_route(bat_priv, orig_node, if_outgoing, neigh_node);
+
+out:
+ if (router_ifinfo)
+ batadv_neigh_ifinfo_put(router_ifinfo);
+ if (neigh_ifinfo)
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ if (router)
+ batadv_neigh_node_put(router);
+}
+
+/**
+ * batadv_v_forward_penalty - apply a penalty to the throughput metric forwarded
+ * with B.A.T.M.A.N. V OGMs
+ * @bat_priv: the bat priv with all the soft interface information
+ * @if_incoming: the interface where the OGM has been received
+ * @if_outgoing: the interface where the OGM has to be forwarded to
+ * @throughput: the current throughput
+ *
+ * Apply a penalty on the current throughput metric value based on the
+ * characteristic of the interface where the OGM has been received. The return
+ * value is computed as follows:
+ * - throughput * 50% if the incoming and outgoing interface are the
+ * same WiFi interface and the throughput is above
+ * 1MBit/s
+ * - throughput if the outgoing interface is the default
+ * interface (i.e. this OGM is processed for the
+ * internal table and not forwarded)
+ * - throughput * hop penalty otherwise
+ *
+ * Return: the penalised throughput metric.
+ */
+static u32 batadv_v_forward_penalty(struct batadv_priv *bat_priv,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing,
+ u32 throughput)
+{
+ int hop_penalty = atomic_read(&bat_priv->hop_penalty);
+ int hop_penalty_max = BATADV_TQ_MAX_VALUE;
+
+ /* Don't apply hop penalty in default originator table. */
+ if (if_outgoing == BATADV_IF_DEFAULT)
+ return throughput;
+
+ /* Forwarding on the same WiFi interface cuts the throughput in half
+ * due to the store & forward characteristics of WIFI.
+ * Very low throughput values are the exception.
+ */
+ if ((throughput > 10) &&
+ (if_incoming == if_outgoing) &&
+ !(if_incoming->bat_v.flags & BATADV_FULL_DUPLEX))
+ return throughput / 2;
+
+ /* hop penalty of 255 equals 100% */
+ return throughput * (hop_penalty_max - hop_penalty) / hop_penalty_max;
+}
+
+/**
+ * batadv_v_ogm_forward - forward an OGM to the given outgoing interface
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ogm_received: previously received OGM to be forwarded
+ * @throughput: throughput to announce, may vary per outgoing interface
+ * @if_incoming: the interface on which this OGM was received on
+ * @if_outgoing: the interface to which the OGM has to be forwarded to
+ *
+ * Forward an OGM to an interface after having altered the throughput metric and
+ * the TTL value contained in it. The original OGM isn't modified.
+ */
+static void batadv_v_ogm_forward(struct batadv_priv *bat_priv,
+ const struct batadv_ogm2_packet *ogm_received,
+ u32 throughput,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_ogm2_packet *ogm_forward;
+ unsigned char *skb_buff;
+ struct sk_buff *skb;
+ size_t packet_len;
+ u16 tvlv_len;
+
+ if (ogm_received->ttl <= 1) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv, "ttl exceeded\n");
+ return;
+ }
+
+ tvlv_len = ntohs(ogm_received->tvlv_len);
+
+ packet_len = BATADV_OGM2_HLEN + tvlv_len;
+ skb = netdev_alloc_skb_ip_align(if_outgoing->net_dev,
+ ETH_HLEN + packet_len);
+ if (!skb)
+ return;
+
+ skb_reserve(skb, ETH_HLEN);
+ skb_buff = skb_put(skb, packet_len);
+ memcpy(skb_buff, ogm_received, packet_len);
+
+ /* apply forward penalty */
+ ogm_forward = (struct batadv_ogm2_packet *)skb_buff;
+ ogm_forward->throughput = htonl(throughput);
+ ogm_forward->ttl--;
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Forwarding OGM2 packet on %s: throughput %u, ttl %u, received via %s\n",
+ if_outgoing->net_dev->name, throughput, ogm_forward->ttl,
+ if_incoming->net_dev->name);
+
+ batadv_v_ogm_send_to_if(skb, if_outgoing);
+}
+
+/**
+ * batadv_v_ogm_metric_update - update route metric based on OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ *
+ * Return:
+ * 1 if the OGM is new,
+ * 0 if it is not new but valid,
+ * <0 on error (e.g. old OGM)
+ */
+static int batadv_v_ogm_metric_update(struct batadv_priv *bat_priv,
+ const struct batadv_ogm2_packet *ogm2,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+ bool protection_started = false;
+ int ret = -EINVAL;
+ u32 path_throughput;
+ s32 seq_diff;
+
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ goto out;
+
+ seq_diff = ntohl(ogm2->seqno) - orig_ifinfo->last_real_seqno;
+
+ if (!hlist_empty(&orig_node->neigh_list) &&
+ batadv_window_protected(bat_priv, seq_diff,
+ BATADV_OGM_MAX_AGE,
+ &orig_ifinfo->batman_seqno_reset,
+ &protection_started)) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: packet within window protection time from %pM\n",
+ ogm2->orig);
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Last reset: %ld, %ld\n",
+ orig_ifinfo->batman_seqno_reset, jiffies);
+ goto out;
+ }
+
+ /* drop packets with old seqnos, however accept the first packet after
+ * a host has been rebooted.
+ */
+ if ((seq_diff < 0) && !protection_started)
+ goto out;
+
+ neigh_node->last_seen = jiffies;
+
+ orig_node->last_seen = jiffies;
+
+ orig_ifinfo->last_real_seqno = ntohl(ogm2->seqno);
+ orig_ifinfo->last_ttl = ogm2->ttl;
+
+ neigh_ifinfo = batadv_neigh_ifinfo_new(neigh_node, if_outgoing);
+ if (!neigh_ifinfo)
+ goto out;
+
+ path_throughput = batadv_v_forward_penalty(bat_priv, if_incoming,
+ if_outgoing,
+ ntohl(ogm2->throughput));
+ neigh_ifinfo->bat_v.throughput = path_throughput;
+ neigh_ifinfo->bat_v.last_seqno = ntohl(ogm2->seqno);
+ neigh_ifinfo->last_ttl = ogm2->ttl;
+
+ if (seq_diff > 0 || protection_started)
+ ret = 1;
+ else
+ ret = 0;
+out:
+ if (orig_ifinfo)
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ if (neigh_ifinfo)
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+
+ return ret;
+}
+
+/**
+ * batadv_v_ogm_route_update - update routes based on OGM
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: the Ethernet header of the OGM2
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ */
+static void batadv_v_ogm_route_update(struct batadv_priv *bat_priv,
+ const struct ethhdr *ethhdr,
+ const struct batadv_ogm2_packet *ogm2,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ struct batadv_neigh_node *router = NULL;
+ struct batadv_neigh_ifinfo *neigh_ifinfo = NULL;
+ struct batadv_orig_node *orig_neigh_node = NULL;
+ struct batadv_orig_ifinfo *orig_ifinfo = NULL;
+ struct batadv_neigh_node *orig_neigh_router = NULL;
+
+ neigh_ifinfo = batadv_neigh_ifinfo_get(neigh_node, if_outgoing);
+ if (!neigh_ifinfo)
+ goto out;
+
+ orig_neigh_node = batadv_v_ogm_orig_get(bat_priv, ethhdr->h_source);
+ if (!orig_neigh_node)
+ goto out;
+
+ orig_neigh_router = batadv_orig_router_get(orig_neigh_node,
+ if_outgoing);
+
+ /* drop packet if sender is not a direct neighbor and if we
+ * don't route towards it
+ */
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+ if (router && router->orig_node != orig_node && !orig_neigh_router) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: OGM via unknown neighbor!\n");
+ goto out;
+ }
+
+ if (router)
+ batadv_neigh_node_put(router);
+
+ /* Update routes, and check if the OGM is from the best next hop */
+ batadv_v_ogm_orig_update(bat_priv, orig_node, neigh_node, ogm2,
+ if_outgoing);
+
+ orig_ifinfo = batadv_orig_ifinfo_new(orig_node, if_outgoing);
+ if (!orig_ifinfo)
+ goto out;
+
+ /* don't forward the same seqno twice on one interface */
+ if (orig_ifinfo->last_seqno_forwarded == ntohl(ogm2->seqno))
+ goto out;
+
+ /* acquire possibly updated router */
+ router = batadv_orig_router_get(orig_node, if_outgoing);
+
+ /* strict rule: forward packets coming from the best next hop only */
+ if (neigh_node != router)
+ goto out;
+
+ /* only forward for specific interface, not for the default one. */
+ if (if_outgoing != BATADV_IF_DEFAULT) {
+ orig_ifinfo->last_seqno_forwarded = ntohl(ogm2->seqno);
+ batadv_v_ogm_forward(bat_priv, ogm2,
+ neigh_ifinfo->bat_v.throughput,
+ if_incoming, if_outgoing);
+ }
+
+out:
+ if (orig_ifinfo)
+ batadv_orig_ifinfo_put(orig_ifinfo);
+ if (router)
+ batadv_neigh_node_put(router);
+ if (orig_neigh_router)
+ batadv_neigh_node_put(orig_neigh_router);
+ if (orig_neigh_node)
+ batadv_orig_node_put(orig_neigh_node);
+ if (neigh_ifinfo)
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+}
+
+/**
+ * batadv_v_ogm_process_per_outif - process a batman v OGM for an outgoing if
+ * @bat_priv: the bat priv with all the soft interface information
+ * @ethhdr: the Ethernet header of the OGM2
+ * @ogm2: OGM2 structure
+ * @orig_node: Originator structure for which the OGM has been received
+ * @neigh_node: the neigh_node through with the OGM has been received
+ * @if_incoming: the interface where this packet was received
+ * @if_outgoing: the interface for which the packet should be considered
+ */
+static void
+batadv_v_ogm_process_per_outif(struct batadv_priv *bat_priv,
+ const struct ethhdr *ethhdr,
+ const struct batadv_ogm2_packet *ogm2,
+ struct batadv_orig_node *orig_node,
+ struct batadv_neigh_node *neigh_node,
+ struct batadv_hard_iface *if_incoming,
+ struct batadv_hard_iface *if_outgoing)
+{
+ int seqno_age;
+
+ /* first, update the metric with according sanity checks */
+ seqno_age = batadv_v_ogm_metric_update(bat_priv, ogm2, orig_node,
+ neigh_node, if_incoming,
+ if_outgoing);
+
+ /* outdated sequence numbers are to be discarded */
+ if (seqno_age < 0)
+ return;
+
+ /* only unknown & newer OGMs contain TVLVs we are interested in */
+ if ((seqno_age > 0) && (if_outgoing == BATADV_IF_DEFAULT))
+ batadv_tvlv_containers_process(bat_priv, true, orig_node,
+ NULL, NULL,
+ (unsigned char *)(ogm2 + 1),
+ ntohs(ogm2->tvlv_len));
+
+ /* if the metric update went through, update routes if needed */
+ batadv_v_ogm_route_update(bat_priv, ethhdr, ogm2, orig_node,
+ neigh_node, if_incoming, if_outgoing);
+}
+
+/**
+ * batadv_v_ogm_aggr_packet - checks if there is another OGM aggregated
+ * @buff_pos: current position in the skb
+ * @packet_len: total length of the skb
+ * @tvlv_len: tvlv length of the previously considered OGM
+ *
+ * Return: true if there is enough space for another OGM, false otherwise.
+ */
+static bool batadv_v_ogm_aggr_packet(int buff_pos, int packet_len,
+ __be16 tvlv_len)
+{
+ int next_buff_pos = 0;
+
+ next_buff_pos += buff_pos + BATADV_OGM2_HLEN;
+ next_buff_pos += ntohs(tvlv_len);
+
+ return (next_buff_pos <= packet_len) &&
+ (next_buff_pos <= BATADV_MAX_AGGREGATION_BYTES);
+}
+
+/**
+ * batadv_v_ogm_process - process an incoming batman v OGM
+ * @skb: the skb containing the OGM
+ * @ogm_offset: offset to the OGM which should be processed (for aggregates)
+ * @if_incoming: the interface where this packet was receved
+ */
+static void batadv_v_ogm_process(const struct sk_buff *skb, int ogm_offset,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct ethhdr *ethhdr;
+ struct batadv_orig_node *orig_node = NULL;
+ struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+ struct batadv_neigh_node *neigh_node = NULL;
+ struct batadv_hard_iface *hard_iface;
+ struct batadv_ogm2_packet *ogm_packet;
+ u32 ogm_throughput, link_throughput, path_throughput;
+
+ ethhdr = eth_hdr(skb);
+ ogm_packet = (struct batadv_ogm2_packet *)(skb->data + ogm_offset);
+
+ ogm_throughput = ntohl(ogm_packet->throughput);
+
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Received OGM2 packet via NB: %pM, IF: %s [%pM] (from OG: %pM, seqno %u, troughput %u, TTL %u, V %u, tvlv_len %u)\n",
+ ethhdr->h_source, if_incoming->net_dev->name,
+ if_incoming->net_dev->dev_addr, ogm_packet->orig,
+ ntohl(ogm_packet->seqno), ogm_throughput, ogm_packet->ttl,
+ ogm_packet->version, ntohs(ogm_packet->tvlv_len));
+
+ /* If the troughput metric is 0, immediately drop the packet. No need to
+ * create orig_node / neigh_node for an unusable route.
+ */
+ if (ogm_throughput == 0) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: originator packet with troughput metric of 0\n");
+ return;
+ }
+
+ /* require ELP packets be to received from this neighbor first */
+ hardif_neigh = batadv_hardif_neigh_get(if_incoming, ethhdr->h_source);
+ if (!hardif_neigh) {
+ batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
+ "Drop packet: OGM via unknown neighbor!\n");
+ goto out;
+ }
+
+ orig_node = batadv_v_ogm_orig_get(bat_priv, ogm_packet->orig);
+ if (!orig_node)
+ return;
+
+ neigh_node = batadv_neigh_node_new(orig_node, if_incoming,
+ ethhdr->h_source);
+ if (!neigh_node)
+ goto out;
+
+ /* Update the received throughput metric to match the link
+ * characteristic:
+ * - If this OGM traveled one hop so far (emitted by single hop
+ * neighbor) the path throughput metric equals the link throughput.
+ * - For OGMs traversing more than hop the path throughput metric is
+ * the smaller of the path throughput and the link throughput.
+ */
+ link_throughput = ewma_throughput_read(&hardif_neigh->bat_v.throughput);
+ path_throughput = min_t(u32, link_throughput, ogm_throughput);
+ ogm_packet->throughput = htonl(path_throughput);
+
+ batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet, orig_node,
+ neigh_node, if_incoming,
+ BATADV_IF_DEFAULT);
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+ if (hard_iface->if_status != BATADV_IF_ACTIVE)
+ continue;
+
+ if (hard_iface->soft_iface != bat_priv->soft_iface)
+ continue;
+
+ batadv_v_ogm_process_per_outif(bat_priv, ethhdr, ogm_packet,
+ orig_node, neigh_node,
+ if_incoming, hard_iface);
+ }
+ rcu_read_unlock();
+out:
+ if (orig_node)
+ batadv_orig_node_put(orig_node);
+ if (neigh_node)
+ batadv_neigh_node_put(neigh_node);
+ if (hardif_neigh)
+ batadv_hardif_neigh_put(hardif_neigh);
+}
+
+/**
+ * batadv_v_ogm_packet_recv - OGM2 receiving handler
+ * @skb: the received OGM
+ * @if_incoming: the interface where this OGM has been received
+ *
+ * Return: NET_RX_SUCCESS and consume the skb on success or returns NET_RX_DROP
+ * (without freeing the skb) on failure
+ */
+int batadv_v_ogm_packet_recv(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming)
+{
+ struct batadv_priv *bat_priv = netdev_priv(if_incoming->soft_iface);
+ struct batadv_ogm2_packet *ogm_packet;
+ struct ethhdr *ethhdr = eth_hdr(skb);
+ int ogm_offset;
+ u8 *packet_pos;
+ int ret = NET_RX_DROP;
+
+ /* did we receive a OGM2 packet on an interface that does not have
+ * B.A.T.M.A.N. V enabled ?
+ */
+ if (strcmp(bat_priv->bat_algo_ops->name, "BATMAN_V") != 0)
+ return NET_RX_DROP;
+
+ if (!batadv_check_management_packet(skb, if_incoming, BATADV_OGM2_HLEN))
+ return NET_RX_DROP;
+
+ if (batadv_is_my_mac(bat_priv, ethhdr->h_source))
+ return NET_RX_DROP;
+
+ ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+
+ if (batadv_is_my_mac(bat_priv, ogm_packet->orig))
+ return NET_RX_DROP;
+
+ batadv_inc_counter(bat_priv, BATADV_CNT_MGMT_RX);
+ batadv_add_counter(bat_priv, BATADV_CNT_MGMT_RX_BYTES,
+ skb->len + ETH_HLEN);
+
+ ogm_offset = 0;
+ ogm_packet = (struct batadv_ogm2_packet *)skb->data;
+
+ while (batadv_v_ogm_aggr_packet(ogm_offset, skb_headlen(skb),
+ ogm_packet->tvlv_len)) {
+ batadv_v_ogm_process(skb, ogm_offset, if_incoming);
+
+ ogm_offset += BATADV_OGM2_HLEN;
+ ogm_offset += ntohs(ogm_packet->tvlv_len);
+
+ packet_pos = skb->data + ogm_offset;
+ ogm_packet = (struct batadv_ogm2_packet *)packet_pos;
+ }
+
+ ret = NET_RX_SUCCESS;
+ consume_skb(skb);
+
+ return ret;
+}
+
+/**
+ * batadv_v_ogm_init - initialise the OGM2 engine
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or a negative error code in case of failure
+ */
+int batadv_v_ogm_init(struct batadv_priv *bat_priv)
+{
+ struct batadv_ogm2_packet *ogm_packet;
+ unsigned char *ogm_buff;
+ u32 random_seqno;
+
+ bat_priv->bat_v.ogm_buff_len = BATADV_OGM2_HLEN;
+ ogm_buff = kzalloc(bat_priv->bat_v.ogm_buff_len, GFP_ATOMIC);
+ if (!ogm_buff)
+ return -ENOMEM;
+
+ bat_priv->bat_v.ogm_buff = ogm_buff;
+ ogm_packet = (struct batadv_ogm2_packet *)ogm_buff;
+ ogm_packet->packet_type = BATADV_OGM2;
+ ogm_packet->version = BATADV_COMPAT_VERSION;
+ ogm_packet->ttl = BATADV_TTL;
+ ogm_packet->flags = BATADV_NO_FLAGS;
+ ogm_packet->throughput = htonl(BATADV_THROUGHPUT_MAX_VALUE);
+
+ /* randomize initial seqno to avoid collision */
+ get_random_bytes(&random_seqno, sizeof(random_seqno));
+ atomic_set(&bat_priv->bat_v.ogm_seqno, random_seqno);
+ INIT_DELAYED_WORK(&bat_priv->bat_v.ogm_wq, batadv_v_ogm_send);
+
+ return 0;
+}
+
+/**
+ * batadv_v_ogm_free - free OGM private resources
+ * @bat_priv: the bat priv with all the soft interface information
+ */
+void batadv_v_ogm_free(struct batadv_priv *bat_priv)
+{
+ cancel_delayed_work_sync(&bat_priv->bat_v.ogm_wq);
+
+ kfree(bat_priv->bat_v.ogm_buff);
+ bat_priv->bat_v.ogm_buff = NULL;
+ bat_priv->bat_v.ogm_buff_len = 0;
+}
diff --git a/net/batman-adv/bat_v_ogm.h b/net/batman-adv/bat_v_ogm.h
new file mode 100644
index 000000000000..d849c75ada0e
--- /dev/null
+++ b/net/batman-adv/bat_v_ogm.h
@@ -0,0 +1,36 @@
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
+ *
+ * Antonio Quartulli
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of version 2 of the GNU General Public
+ * License as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see <http://www.gnu.org/licenses/>.
+ */
+
+#ifndef _BATMAN_ADV_BATADV_V_OGM_H_
+#define _BATMAN_ADV_BATADV_V_OGM_H_
+
+#include <linux/types.h>
+
+struct batadv_hard_iface;
+struct batadv_priv;
+struct sk_buff;
+
+int batadv_v_ogm_init(struct batadv_priv *bat_priv);
+void batadv_v_ogm_free(struct batadv_priv *bat_priv);
+int batadv_v_ogm_iface_enable(struct batadv_hard_iface *hard_iface);
+struct batadv_orig_node *batadv_v_ogm_orig_get(struct batadv_priv *bat_priv,
+ const u8 *addr);
+void batadv_v_ogm_primary_iface_set(struct batadv_hard_iface *primary_iface);
+int batadv_v_ogm_packet_recv(struct sk_buff *skb,
+ struct batadv_hard_iface *if_incoming);
+
+#endif /* _BATMAN_ADV_BATADV_V_OGM_H_ */
diff --git a/net/batman-adv/bitarray.c b/net/batman-adv/bitarray.c
index 25cbc36e997a..b56bb000a0ab 100644
--- a/net/batman-adv/bitarray.c
+++ b/net/batman-adv/bitarray.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -29,10 +29,16 @@ static void batadv_bitmap_shift_left(unsigned long *seq_bits, s32 n)
bitmap_shift_left(seq_bits, seq_bits, n, BATADV_TQ_LOCAL_WINDOW_SIZE);
}
-/* receive and process one packet within the sequence number window.
+/**
+ * batadv_bit_get_packet - receive and process one packet within the sequence
+ * number window
+ * @priv: the bat priv with all the soft interface information
+ * @seq_bits: pointer to the sequence number receive packet
+ * @seq_num_diff: difference between the current/received sequence number and
+ * the last sequence number
+ * @set_mark: whether this packet should be marked in seq_bits
*
- * returns:
- * 1 if the window was moved (either new or very old)
+ * Return: 1 if the window was moved (either new or very old),
* 0 if the window was not moved/shifted.
*/
int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
diff --git a/net/batman-adv/bitarray.h b/net/batman-adv/bitarray.h
index 0226b220fe5b..3e41bb80eb81 100644
--- a/net/batman-adv/bitarray.h
+++ b/net/batman-adv/bitarray.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -24,7 +24,14 @@
#include <linux/compiler.h>
#include <linux/types.h>
-/* Returns 1 if the corresponding bit in the given seq_bits indicates true
+/**
+ * batadv_test_bit - check if bit is set in the current window
+ *
+ * @seq_bits: pointer to the sequence number receive packet
+ * @last_seqno: latest sequence number in seq_bits
+ * @curr_seqno: sequence number to test for
+ *
+ * Return: 1 if the corresponding bit in the given seq_bits indicates true
* and curr_seqno is within range of last_seqno. Otherwise returns 0.
*/
static inline int batadv_test_bit(const unsigned long *seq_bits,
@@ -48,9 +55,6 @@ static inline void batadv_set_bit(unsigned long *seq_bits, s32 n)
set_bit(n, seq_bits); /* turn the position on */
}
-/* receive and process one packet, returns 1 if received seq_num is considered
- * new, 0 if old
- */
int batadv_bit_get_packet(void *priv, unsigned long *seq_bits, s32 seq_num_diff,
int set_mark);
diff --git a/net/batman-adv/bridge_loop_avoidance.c b/net/batman-adv/bridge_loop_avoidance.c
index 191a70290dca..0a6c8b824a00 100644
--- a/net/batman-adv/bridge_loop_avoidance.c
+++ b/net/batman-adv/bridge_loop_avoidance.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*
@@ -31,6 +31,7 @@
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
@@ -58,7 +59,13 @@ static void
batadv_bla_send_announce(struct batadv_priv *bat_priv,
struct batadv_bla_backbone_gw *backbone_gw);
-/* return the index of the claim */
+/**
+ * batadv_choose_claim - choose the right bucket for a claim.
+ * @data: data to hash
+ * @size: size of the hash table
+ *
+ * Return: the hash index of the claim
+ */
static inline u32 batadv_choose_claim(const void *data, u32 size)
{
struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
@@ -70,7 +77,13 @@ static inline u32 batadv_choose_claim(const void *data, u32 size)
return hash % size;
}
-/* return the index of the backbone gateway */
+/**
+ * batadv_choose_backbone_gw - choose the right bucket for a backbone gateway.
+ * @data: data to hash
+ * @size: size of the hash table
+ *
+ * Return: the hash index of the backbone gateway
+ */
static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
{
const struct batadv_bla_claim *claim = (struct batadv_bla_claim *)data;
@@ -82,7 +95,13 @@ static inline u32 batadv_choose_backbone_gw(const void *data, u32 size)
return hash % size;
}
-/* compares address and vid of two backbone gws */
+/**
+ * batadv_compare_backbone_gw - compare address and vid of two backbone gws
+ * @node: list node of the first entry to compare
+ * @data2: pointer to the second backbone gateway
+ *
+ * Return: 1 if the backbones have the same data, 0 otherwise
+ */
static int batadv_compare_backbone_gw(const struct hlist_node *node,
const void *data2)
{
@@ -100,7 +119,13 @@ static int batadv_compare_backbone_gw(const struct hlist_node *node,
return 1;
}
-/* compares address and vid of two claims */
+/**
+ * batadv_compare_backbone_gw - compare address and vid of two claims
+ * @node: list node of the first entry to compare
+ * @data2: pointer to the second claims
+ *
+ * Return: 1 if the claim have the same data, 0 otherwise
+ */
static int batadv_compare_claim(const struct hlist_node *node,
const void *data2)
{
@@ -118,39 +143,62 @@ static int batadv_compare_claim(const struct hlist_node *node,
return 1;
}
-/* free a backbone gw */
-static void
-batadv_backbone_gw_free_ref(struct batadv_bla_backbone_gw *backbone_gw)
+/**
+ * batadv_backbone_gw_release - release backbone gw from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the backbone gw
+ */
+static void batadv_backbone_gw_release(struct kref *ref)
{
- if (atomic_dec_and_test(&backbone_gw->refcount))
- kfree_rcu(backbone_gw, rcu);
+ struct batadv_bla_backbone_gw *backbone_gw;
+
+ backbone_gw = container_of(ref, struct batadv_bla_backbone_gw,
+ refcount);
+
+ kfree_rcu(backbone_gw, rcu);
}
-/* finally deinitialize the claim */
-static void batadv_claim_free_rcu(struct rcu_head *rcu)
+/**
+ * batadv_backbone_gw_put - decrement the backbone gw refcounter and possibly
+ * release it
+ * @backbone_gw: backbone gateway to be free'd
+ */
+static void batadv_backbone_gw_put(struct batadv_bla_backbone_gw *backbone_gw)
+{
+ kref_put(&backbone_gw->refcount, batadv_backbone_gw_release);
+}
+
+/**
+ * batadv_claim_release - release claim from lists and queue for free after rcu
+ * grace period
+ * @ref: kref pointer of the claim
+ */
+static void batadv_claim_release(struct kref *ref)
{
struct batadv_bla_claim *claim;
- claim = container_of(rcu, struct batadv_bla_claim, rcu);
+ claim = container_of(ref, struct batadv_bla_claim, refcount);
- batadv_backbone_gw_free_ref(claim->backbone_gw);
- kfree(claim);
+ batadv_backbone_gw_put(claim->backbone_gw);
+ kfree_rcu(claim, rcu);
}
-/* free a claim, call claim_free_rcu if its the last reference */
-static void batadv_claim_free_ref(struct batadv_bla_claim *claim)
+/**
+ * batadv_claim_put - decrement the claim refcounter and possibly
+ * release it
+ * @claim: claim to be free'd
+ */
+static void batadv_claim_put(struct batadv_bla_claim *claim)
{
- if (atomic_dec_and_test(&claim->refcount))
- call_rcu(&claim->rcu, batadv_claim_free_rcu);
+ kref_put(&claim->refcount, batadv_claim_release);
}
/**
- * batadv_claim_hash_find
+ * batadv_claim_hash_find - looks for a claim in the claim hash
* @bat_priv: the bat priv with all the soft interface information
* @data: search data (may be local/static data)
*
- * looks for a claim in the hash, and returns it if found
- * or NULL otherwise.
+ * Return: claim if found or NULL otherwise.
*/
static struct batadv_bla_claim
*batadv_claim_hash_find(struct batadv_priv *bat_priv,
@@ -173,7 +221,7 @@ static struct batadv_bla_claim
if (!batadv_compare_claim(&claim->hash_entry, data))
continue;
- if (!atomic_inc_not_zero(&claim->refcount))
+ if (!kref_get_unless_zero(&claim->refcount))
continue;
claim_tmp = claim;
@@ -185,12 +233,12 @@ static struct batadv_bla_claim
}
/**
- * batadv_backbone_hash_find - looks for a claim in the hash
+ * batadv_backbone_hash_find - looks for a backbone gateway in the hash
* @bat_priv: the bat priv with all the soft interface information
* @addr: the address of the originator
* @vid: the VLAN ID
*
- * Returns claim if found or NULL otherwise.
+ * Return: backbone gateway if found or NULL otherwise
*/
static struct batadv_bla_backbone_gw *
batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
@@ -217,7 +265,7 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
&search_entry))
continue;
- if (!atomic_inc_not_zero(&backbone_gw->refcount))
+ if (!kref_get_unless_zero(&backbone_gw->refcount))
continue;
backbone_gw_tmp = backbone_gw;
@@ -228,7 +276,10 @@ batadv_backbone_hash_find(struct batadv_priv *bat_priv, u8 *addr,
return backbone_gw_tmp;
}
-/* delete all claims for a backbone */
+/**
+ * batadv_bla_del_backbone_claims - delete all claims for a backbone
+ * @backbone_gw: backbone gateway where the claims should be removed
+ */
static void
batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
{
@@ -253,14 +304,16 @@ batadv_bla_del_backbone_claims(struct batadv_bla_backbone_gw *backbone_gw)
if (claim->backbone_gw != backbone_gw)
continue;
- batadv_claim_free_ref(claim);
+ batadv_claim_put(claim);
hlist_del_rcu(&claim->hash_entry);
}
spin_unlock_bh(list_lock);
}
/* all claims gone, initialize CRC */
+ spin_lock_bh(&backbone_gw->crc_lock);
backbone_gw->crc = BATADV_BLA_CRC_INIT;
+ spin_unlock_bh(&backbone_gw->crc_lock);
}
/**
@@ -370,18 +423,17 @@ static void batadv_bla_send_claim(struct batadv_priv *bat_priv, u8 *mac,
netif_rx(skb);
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
}
/**
- * batadv_bla_get_backbone_gw
+ * batadv_bla_get_backbone_gw - finds or creates a backbone gateway
* @bat_priv: the bat priv with all the soft interface information
* @orig: the mac address of the originator
* @vid: the VLAN ID
* @own_backbone: set if the requested backbone is local
*
- * searches for the backbone gw or creates a new one if it could not
- * be found.
+ * Return: the (possibly created) backbone gateway or NULL on error
*/
static struct batadv_bla_backbone_gw *
batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
@@ -408,12 +460,14 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
entry->lasttime = jiffies;
entry->crc = BATADV_BLA_CRC_INIT;
entry->bat_priv = bat_priv;
+ spin_lock_init(&entry->crc_lock);
atomic_set(&entry->request_sent, 0);
atomic_set(&entry->wait_periods, 0);
ether_addr_copy(entry->orig, orig);
/* one for the hash, one for returning */
- atomic_set(&entry->refcount, 2);
+ kref_init(&entry->refcount);
+ kref_get(&entry->refcount);
hash_added = batadv_hash_add(bat_priv->bla.backbone_hash,
batadv_compare_backbone_gw,
@@ -431,7 +485,7 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
if (orig_node) {
batadv_tt_global_del_orig(bat_priv, orig_node, vid,
"became a backbone gateway");
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
}
if (own_backbone) {
@@ -446,7 +500,13 @@ batadv_bla_get_backbone_gw(struct batadv_priv *bat_priv, u8 *orig,
return entry;
}
-/* update or add the own backbone gw to make sure we announce
+/**
+ * batadv_bla_update_own_backbone_gw - updates the own backbone gw for a VLAN
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the selected primary interface
+ * @vid: VLAN identifier
+ *
+ * update or add the own backbone gw to make sure we announce
* where we receive other backbone gws
*/
static void
@@ -463,7 +523,7 @@ batadv_bla_update_own_backbone_gw(struct batadv_priv *bat_priv,
return;
backbone_gw->lasttime = jiffies;
- batadv_backbone_gw_free_ref(backbone_gw);
+ batadv_backbone_gw_put(backbone_gw);
}
/**
@@ -512,7 +572,7 @@ static void batadv_bla_answer_request(struct batadv_priv *bat_priv,
/* finally, send an announcement frame */
batadv_bla_send_announce(bat_priv, backbone_gw);
- batadv_backbone_gw_free_ref(backbone_gw);
+ batadv_backbone_gw_put(backbone_gw);
}
/**
@@ -543,12 +603,9 @@ static void batadv_bla_send_request(struct batadv_bla_backbone_gw *backbone_gw)
}
/**
- * batadv_bla_send_announce
+ * batadv_bla_send_announce - Send an announcement frame
* @bat_priv: the bat priv with all the soft interface information
* @backbone_gw: our backbone gateway which should be announced
- *
- * This function sends an announcement. It is called from multiple
- * places.
*/
static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
struct batadv_bla_backbone_gw *backbone_gw)
@@ -557,7 +614,9 @@ static void batadv_bla_send_announce(struct batadv_priv *bat_priv,
__be16 crc;
memcpy(mac, batadv_announce_mac, 4);
+ spin_lock_bh(&backbone_gw->crc_lock);
crc = htons(backbone_gw->crc);
+ spin_unlock_bh(&backbone_gw->crc_lock);
memcpy(&mac[4], &crc, 2);
batadv_bla_send_claim(bat_priv, mac, backbone_gw->vid,
@@ -594,7 +653,8 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
claim->lasttime = jiffies;
claim->backbone_gw = backbone_gw;
- atomic_set(&claim->refcount, 2);
+ kref_init(&claim->refcount);
+ kref_get(&claim->refcount);
batadv_dbg(BATADV_DBG_BLA, bat_priv,
"bla_add_claim(): adding new entry %pM, vid %d to hash ...\n",
mac, BATADV_PRINT_VID(vid));
@@ -618,22 +678,29 @@ static void batadv_bla_add_claim(struct batadv_priv *bat_priv,
"bla_add_claim(): changing ownership for %pM, vid %d\n",
mac, BATADV_PRINT_VID(vid));
+ spin_lock_bh(&claim->backbone_gw->crc_lock);
claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
- batadv_backbone_gw_free_ref(claim->backbone_gw);
+ spin_unlock_bh(&claim->backbone_gw->crc_lock);
+ batadv_backbone_gw_put(claim->backbone_gw);
}
/* set (new) backbone gw */
- atomic_inc(&backbone_gw->refcount);
+ kref_get(&backbone_gw->refcount);
claim->backbone_gw = backbone_gw;
+ spin_lock_bh(&backbone_gw->crc_lock);
backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ spin_unlock_bh(&backbone_gw->crc_lock);
backbone_gw->lasttime = jiffies;
claim_free_ref:
- batadv_claim_free_ref(claim);
+ batadv_claim_put(claim);
}
-/* Delete a claim from the claim hash which has the
- * given mac address and vid.
+/**
+ * batadv_bla_del_claim - delete a claim from the claim hash
+ * @bat_priv: the bat priv with all the soft interface information
+ * @mac: mac address of the claim to be removed
+ * @vid: VLAN id for the claim to be removed
*/
static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
const u8 *mac, const unsigned short vid)
@@ -651,20 +718,30 @@ static void batadv_bla_del_claim(struct batadv_priv *bat_priv,
batadv_hash_remove(bat_priv->bla.claim_hash, batadv_compare_claim,
batadv_choose_claim, claim);
- batadv_claim_free_ref(claim); /* reference from the hash is gone */
+ batadv_claim_put(claim); /* reference from the hash is gone */
+ spin_lock_bh(&claim->backbone_gw->crc_lock);
claim->backbone_gw->crc ^= crc16(0, claim->addr, ETH_ALEN);
+ spin_unlock_bh(&claim->backbone_gw->crc_lock);
/* don't need the reference from hash_find() anymore */
- batadv_claim_free_ref(claim);
+ batadv_claim_put(claim);
}
-/* check for ANNOUNCE frame, return 1 if handled */
+/**
+ * batadv_handle_announce - check for ANNOUNCE frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @an_addr: announcement mac address (ARP Sender HW address)
+ * @backbone_addr: originator address of the sender (Ethernet source MAC)
+ * @vid: the VLAN ID of the frame
+ *
+ * Return: 1 if handled
+ */
static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
u8 *backbone_addr, unsigned short vid)
{
struct batadv_bla_backbone_gw *backbone_gw;
- u16 crc;
+ u16 backbone_crc, crc;
if (memcmp(an_addr, batadv_announce_mac, 4) != 0)
return 0;
@@ -683,12 +760,16 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
"handle_announce(): ANNOUNCE vid %d (sent by %pM)... CRC = %#.4x\n",
BATADV_PRINT_VID(vid), backbone_gw->orig, crc);
- if (backbone_gw->crc != crc) {
+ spin_lock_bh(&backbone_gw->crc_lock);
+ backbone_crc = backbone_gw->crc;
+ spin_unlock_bh(&backbone_gw->crc_lock);
+
+ if (backbone_crc != crc) {
batadv_dbg(BATADV_DBG_BLA, backbone_gw->bat_priv,
"handle_announce(): CRC FAILED for %pM/%d (my = %#.4x, sent = %#.4x)\n",
backbone_gw->orig,
BATADV_PRINT_VID(backbone_gw->vid),
- backbone_gw->crc, crc);
+ backbone_crc, crc);
batadv_bla_send_request(backbone_gw);
} else {
@@ -701,11 +782,20 @@ static int batadv_handle_announce(struct batadv_priv *bat_priv, u8 *an_addr,
}
}
- batadv_backbone_gw_free_ref(backbone_gw);
+ batadv_backbone_gw_put(backbone_gw);
return 1;
}
-/* check for REQUEST frame, return 1 if handled */
+/**
+ * batadv_handle_request - check for REQUEST frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the primary hard interface of this batman soft interface
+ * @backbone_addr: backbone address to be requested (ARP sender HW MAC)
+ * @ethhdr: ethernet header of a packet
+ * @vid: the VLAN ID of the frame
+ *
+ * Return: 1 if handled
+ */
static int batadv_handle_request(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
u8 *backbone_addr, struct ethhdr *ethhdr,
@@ -729,7 +819,16 @@ static int batadv_handle_request(struct batadv_priv *bat_priv,
return 1;
}
-/* check for UNCLAIM frame, return 1 if handled */
+/**
+ * batadv_handle_unclaim - check for UNCLAIM frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the primary hard interface of this batman soft interface
+ * @backbone_addr: originator address of the backbone (Ethernet source)
+ * @claim_addr: Client to be unclaimed (ARP sender HW MAC)
+ * @vid: the VLAN ID of the frame
+ *
+ * Return: 1 if handled
+ */
static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
u8 *backbone_addr, u8 *claim_addr,
@@ -754,11 +853,20 @@ static int batadv_handle_unclaim(struct batadv_priv *bat_priv,
claim_addr, BATADV_PRINT_VID(vid), backbone_gw->orig);
batadv_bla_del_claim(bat_priv, claim_addr, vid);
- batadv_backbone_gw_free_ref(backbone_gw);
+ batadv_backbone_gw_put(backbone_gw);
return 1;
}
-/* check for CLAIM frame, return 1 if handled */
+/**
+ * batadv_handle_claim - check for CLAIM frame
+ * @bat_priv: the bat priv with all the soft interface information
+ * @primary_if: the primary hard interface of this batman soft interface
+ * @backbone_addr: originator address of the backbone (Ethernet Source)
+ * @claim_addr: client mac address to be claimed (ARP sender HW MAC)
+ * @vid: the VLAN ID of the frame
+ *
+ * Return: 1 if handled
+ */
static int batadv_handle_claim(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
u8 *backbone_addr, u8 *claim_addr,
@@ -782,12 +890,12 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
/* TODO: we could call something like tt_local_del() here. */
- batadv_backbone_gw_free_ref(backbone_gw);
+ batadv_backbone_gw_put(backbone_gw);
return 1;
}
/**
- * batadv_check_claim_group
+ * batadv_check_claim_group - check for claim group membership
* @bat_priv: the bat priv with all the soft interface information
* @primary_if: the primary interface of this batman interface
* @hw_src: the Hardware source in the ARP Header
@@ -798,7 +906,7 @@ static int batadv_handle_claim(struct batadv_priv *bat_priv,
* This function also applies the group ID of the sender
* if it is in the same mesh.
*
- * returns:
+ * Return:
* 2 - if it is a claim packet and on the same group
* 1 - if is a claim packet from another group
* 0 - if it is not a claim packet
@@ -856,20 +964,18 @@ static int batadv_check_claim_group(struct batadv_priv *bat_priv,
bla_dst_own->group = bla_dst->group;
}
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return 2;
}
/**
- * batadv_bla_process_claim
+ * batadv_bla_process_claim - Check if this is a claim frame, and process it
* @bat_priv: the bat priv with all the soft interface information
* @primary_if: the primary hard interface of this batman soft interface
* @skb: the frame to be checked
*
- * Check if this is a claim frame, and process it accordingly.
- *
- * returns 1 if it was a claim frame, otherwise return 0 to
+ * Return: 1 if it was a claim frame, otherwise return 0 to
* tell the callee that it can use the frame on its own.
*/
static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
@@ -1000,7 +1106,13 @@ static int batadv_bla_process_claim(struct batadv_priv *bat_priv,
return 1;
}
-/* Check when we last heard from other nodes, and remove them in case of
+/**
+ * batadv_bla_purge_backbone_gw - Remove backbone gateways after a timeout or
+ * immediately
+ * @bat_priv: the bat priv with all the soft interface information
+ * @now: whether the whole hash shall be wiped now
+ *
+ * Check when we last heard from other nodes, and remove them in case of
* a time out, or clean all backbone gws if now is set.
*/
static void batadv_bla_purge_backbone_gw(struct batadv_priv *bat_priv, int now)
@@ -1041,14 +1153,14 @@ purge_now:
batadv_bla_del_backbone_claims(backbone_gw);
hlist_del_rcu(&backbone_gw->hash_entry);
- batadv_backbone_gw_free_ref(backbone_gw);
+ batadv_backbone_gw_put(backbone_gw);
}
spin_unlock_bh(list_lock);
}
}
/**
- * batadv_bla_purge_claims
+ * batadv_bla_purge_claims - Remove claims after a timeout or immediately
* @bat_priv: the bat priv with all the soft interface information
* @primary_if: the selected primary interface, may be NULL if now is set
* @now: whether the whole hash shall be wiped now
@@ -1097,12 +1209,11 @@ purge_now:
}
/**
- * batadv_bla_update_orig_address
+ * batadv_bla_update_orig_address - Update the backbone gateways when the own
+ * originator address changes
* @bat_priv: the bat priv with all the soft interface information
* @primary_if: the new selected primary_if
* @oldif: the old primary interface, may be NULL
- *
- * Update the backbone gateways when the own orig address changes.
*/
void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
@@ -1153,7 +1264,31 @@ void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
}
}
-/* periodic work to do:
+/**
+ * batadv_bla_status_update - purge bla interfaces if necessary
+ * @net_dev: the soft interface net device
+ */
+void batadv_bla_status_update(struct net_device *net_dev)
+{
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_primary_if_get_selected(bat_priv);
+ if (!primary_if)
+ return;
+
+ /* this function already purges everything when bla is disabled,
+ * so just call that one.
+ */
+ batadv_bla_update_orig_address(bat_priv, primary_if, primary_if);
+ batadv_hardif_put(primary_if);
+}
+
+/**
+ * batadv_bla_periodic_work - performs periodic bla work
+ * @work: kernel work struct
+ *
+ * periodic work to do:
* * purge structures when they are too old
* * send announcements
*/
@@ -1220,7 +1355,7 @@ static void batadv_bla_periodic_work(struct work_struct *work)
}
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
queue_delayed_work(batadv_event_workqueue, &bat_priv->bla.work,
msecs_to_jiffies(BATADV_BLA_PERIOD_LENGTH));
@@ -1234,7 +1369,12 @@ out:
static struct lock_class_key batadv_claim_hash_lock_class_key;
static struct lock_class_key batadv_backbone_hash_lock_class_key;
-/* initialize all bla structures */
+/**
+ * batadv_bla_init - initialize all bla structures
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success, < 0 on error.
+ */
int batadv_bla_init(struct batadv_priv *bat_priv)
{
int i;
@@ -1254,7 +1394,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
if (primary_if) {
crc = crc16(0, primary_if->net_dev->dev_addr, ETH_ALEN);
bat_priv->bla.claim_dest.group = htons(crc);
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
} else {
bat_priv->bla.claim_dest.group = 0; /* will be set later */
}
@@ -1289,7 +1429,7 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
}
/**
- * batadv_bla_check_bcast_duplist
+ * batadv_bla_check_bcast_duplist - Check if a frame is in the broadcast dup.
* @bat_priv: the bat priv with all the soft interface information
* @skb: contains the bcast_packet to be checked
*
@@ -1301,6 +1441,8 @@ int batadv_bla_init(struct batadv_priv *bat_priv)
* with a good chance that it is the same packet. If it is furthermore
* sent by another host, drop it. We allow equal packets from
* the same host however as this might be intended.
+ *
+ * Return: 1 if a packet is in the duplicate list, 0 otherwise.
*/
int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
struct sk_buff *skb)
@@ -1359,14 +1501,13 @@ out:
}
/**
- * batadv_bla_is_backbone_gw_orig
+ * batadv_bla_is_backbone_gw_orig - Check if the originator is a gateway for
+ * the VLAN identified by vid.
* @bat_priv: the bat priv with all the soft interface information
* @orig: originator mac address
* @vid: VLAN identifier
*
- * Check if the originator is a gateway for the VLAN identified by vid.
- *
- * Returns true if orig is a backbone for this vid, false otherwise.
+ * Return: true if orig is a backbone for this vid, false otherwise.
*/
bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
unsigned short vid)
@@ -1400,14 +1541,13 @@ bool batadv_bla_is_backbone_gw_orig(struct batadv_priv *bat_priv, u8 *orig,
}
/**
- * batadv_bla_is_backbone_gw
+ * batadv_bla_is_backbone_gw - check if originator is a backbone gw for a VLAN.
* @skb: the frame to be checked
* @orig_node: the orig_node of the frame
* @hdr_size: maximum length of the frame
*
- * bla_is_backbone_gw inspects the skb for the VLAN ID and returns 1
- * if the orig_node is also a gateway on the soft interface, otherwise it
- * returns 0.
+ * Return: 1 if the orig_node is also a gateway on the soft interface, otherwise
+ * it returns 0.
*/
int batadv_bla_is_backbone_gw(struct sk_buff *skb,
struct batadv_orig_node *orig_node, int hdr_size)
@@ -1430,11 +1570,16 @@ int batadv_bla_is_backbone_gw(struct sk_buff *skb,
if (!backbone_gw)
return 0;
- batadv_backbone_gw_free_ref(backbone_gw);
+ batadv_backbone_gw_put(backbone_gw);
return 1;
}
-/* free all bla structures (for softinterface free or module unload) */
+/**
+ * batadv_bla_init - free all bla structures
+ * @bat_priv: the bat priv with all the soft interface information
+ *
+ * for softinterface free or module unload
+ */
void batadv_bla_free(struct batadv_priv *bat_priv)
{
struct batadv_hard_iface *primary_if;
@@ -1453,22 +1598,23 @@ void batadv_bla_free(struct batadv_priv *bat_priv)
bat_priv->bla.backbone_hash = NULL;
}
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
}
/**
- * batadv_bla_rx
+ * batadv_bla_rx - check packets coming from the mesh.
* @bat_priv: the bat priv with all the soft interface information
* @skb: the frame to be checked
* @vid: the VLAN ID of the frame
* @is_bcast: the packet came in a broadcast packet type.
*
- * bla_rx avoidance checks if:
+ * batadv_bla_rx avoidance checks if:
* * we have to race for a claim
* * if the frame is allowed on the LAN
*
- * in these cases, the skb is further handled by this function and
- * returns 1, otherwise it returns 0 and the caller shall further
+ * in these cases, the skb is further handled by this function
+ *
+ * Return: 1 if handled, otherwise it returns 0 and the caller shall further
* process the skb.
*/
int batadv_bla_rx(struct batadv_priv *bat_priv, struct sk_buff *skb,
@@ -1545,27 +1691,28 @@ handled:
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
if (claim)
- batadv_claim_free_ref(claim);
+ batadv_claim_put(claim);
return ret;
}
/**
- * batadv_bla_tx
+ * batadv_bla_tx - check packets going into the mesh
* @bat_priv: the bat priv with all the soft interface information
* @skb: the frame to be checked
* @vid: the VLAN ID of the frame
*
- * bla_tx checks if:
+ * batadv_bla_tx checks if:
* * a claim was received which has to be processed
* * the frame is allowed on the mesh
*
- * in these cases, the skb is further handled by this function and
- * returns 1, otherwise it returns 0 and the caller shall further
- * process the skb.
+ * in these cases, the skb is further handled by this function.
*
* This call might reallocate skb data.
+ *
+ * Return: 1 if handled, otherwise it returns 0 and the caller shall further
+ * process the skb.
*/
int batadv_bla_tx(struct batadv_priv *bat_priv, struct sk_buff *skb,
unsigned short vid)
@@ -1633,12 +1780,19 @@ handled:
ret = 1;
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
if (claim)
- batadv_claim_free_ref(claim);
+ batadv_claim_put(claim);
return ret;
}
+/**
+ * batadv_bla_claim_table_seq_print_text - print the claim table in a seq file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
{
struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1647,6 +1801,7 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
struct batadv_bla_claim *claim;
struct batadv_hard_iface *primary_if;
struct hlist_head *head;
+ u16 backbone_crc;
u32 i;
bool is_own;
u8 *primary_addr;
@@ -1669,20 +1824,32 @@ int batadv_bla_claim_table_seq_print_text(struct seq_file *seq, void *offset)
hlist_for_each_entry_rcu(claim, head, hash_entry) {
is_own = batadv_compare_eth(claim->backbone_gw->orig,
primary_addr);
+
+ spin_lock_bh(&claim->backbone_gw->crc_lock);
+ backbone_crc = claim->backbone_gw->crc;
+ spin_unlock_bh(&claim->backbone_gw->crc_lock);
seq_printf(seq, " * %pM on %5d by %pM [%c] (%#.4x)\n",
claim->addr, BATADV_PRINT_VID(claim->vid),
claim->backbone_gw->orig,
(is_own ? 'x' : ' '),
- claim->backbone_gw->crc);
+ backbone_crc);
}
rcu_read_unlock();
}
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return 0;
}
+/**
+ * batadv_bla_backbone_table_seq_print_text - print the backbone table in a seq
+ * file
+ * @seq: seq file to print on
+ * @offset: not used
+ *
+ * Return: always 0
+ */
int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
{
struct net_device *net_dev = (struct net_device *)seq->private;
@@ -1692,6 +1859,7 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
struct batadv_hard_iface *primary_if;
struct hlist_head *head;
int secs, msecs;
+ u16 backbone_crc;
u32 i;
bool is_own;
u8 *primary_addr;
@@ -1722,15 +1890,19 @@ int batadv_bla_backbone_table_seq_print_text(struct seq_file *seq, void *offset)
if (is_own)
continue;
+ spin_lock_bh(&backbone_gw->crc_lock);
+ backbone_crc = backbone_gw->crc;
+ spin_unlock_bh(&backbone_gw->crc_lock);
+
seq_printf(seq, " * %pM on %5d %4i.%03is (%#.4x)\n",
backbone_gw->orig,
BATADV_PRINT_VID(backbone_gw->vid), secs,
- msecs, backbone_gw->crc);
+ msecs, backbone_crc);
}
rcu_read_unlock();
}
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return 0;
}
diff --git a/net/batman-adv/bridge_loop_avoidance.h b/net/batman-adv/bridge_loop_avoidance.h
index 025152b34282..579f0fa6fe6a 100644
--- a/net/batman-adv/bridge_loop_avoidance.h
+++ b/net/batman-adv/bridge_loop_avoidance.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich
*
@@ -22,6 +22,7 @@
#include <linux/types.h>
+struct net_device;
struct seq_file;
struct sk_buff;
@@ -42,6 +43,7 @@ int batadv_bla_check_bcast_duplist(struct batadv_priv *bat_priv,
void batadv_bla_update_orig_address(struct batadv_priv *bat_priv,
struct batadv_hard_iface *primary_if,
struct batadv_hard_iface *oldif);
+void batadv_bla_status_update(struct net_device *net_dev);
int batadv_bla_init(struct batadv_priv *bat_priv);
void batadv_bla_free(struct batadv_priv *bat_priv);
diff --git a/net/batman-adv/debugfs.c b/net/batman-adv/debugfs.c
index c4c1e8030ba0..48253cf8341b 100644
--- a/net/batman-adv/debugfs.c
+++ b/net/batman-adv/debugfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -262,6 +262,13 @@ static int batadv_algorithms_open(struct inode *inode, struct file *file)
return single_open(file, batadv_algo_seq_print_text, NULL);
}
+static int neighbors_open(struct inode *inode, struct file *file)
+{
+ struct net_device *net_dev = (struct net_device *)inode->i_private;
+
+ return single_open(file, batadv_hardif_neigh_seq_print_text, net_dev);
+}
+
static int batadv_originators_open(struct inode *inode, struct file *file)
{
struct net_device *net_dev = (struct net_device *)inode->i_private;
@@ -274,6 +281,8 @@ static int batadv_originators_open(struct inode *inode, struct file *file)
* originator table of an hard interface
* @inode: inode pointer to debugfs file
* @file: pointer to the seq_file
+ *
+ * Return: 0 on success or negative error number in case of failure
*/
static int batadv_originators_hardif_open(struct inode *inode,
struct file *file)
@@ -322,6 +331,8 @@ static int batadv_bla_backbone_table_open(struct inode *inode,
* batadv_dat_cache_open - Prepare file handler for reads from dat_chache
* @inode: inode which was opened
* @file: file handle to be initialized
+ *
+ * Return: 0 on success or negative error number in case of failure
*/
static int batadv_dat_cache_open(struct inode *inode, struct file *file)
{
@@ -375,6 +386,7 @@ static struct batadv_debuginfo *batadv_general_debuginfos[] = {
};
/* The following attributes are per soft interface */
+static BATADV_DEBUGINFO(neighbors, S_IRUGO, neighbors_open);
static BATADV_DEBUGINFO(originators, S_IRUGO, batadv_originators_open);
static BATADV_DEBUGINFO(gateways, S_IRUGO, batadv_gateways_open);
static BATADV_DEBUGINFO(transtable_global, S_IRUGO,
@@ -394,6 +406,7 @@ static BATADV_DEBUGINFO(nc_nodes, S_IRUGO, batadv_nc_nodes_open);
#endif
static struct batadv_debuginfo *batadv_mesh_debuginfos[] = {
+ &batadv_debuginfo_neighbors,
&batadv_debuginfo_originators,
&batadv_debuginfo_gateways,
&batadv_debuginfo_transtable_global,
@@ -474,6 +487,8 @@ void batadv_debugfs_destroy(void)
* batadv_debugfs_add_hardif - creates the base directory for a hard interface
* in debugfs.
* @hard_iface: hard interface which should be added.
+ *
+ * Return: 0 on success or negative error number in case of failure
*/
int batadv_debugfs_add_hardif(struct batadv_hard_iface *hard_iface)
{
diff --git a/net/batman-adv/debugfs.h b/net/batman-adv/debugfs.h
index 80ab8d6f0ab3..1ab4e2e63afc 100644
--- a/net/batman-adv/debugfs.h
+++ b/net/batman-adv/debugfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/distributed-arp-table.c b/net/batman-adv/distributed-arp-table.c
index 83bc1aaf5800..e96d7c745b4a 100644
--- a/net/batman-adv/distributed-arp-table.c
+++ b/net/batman-adv/distributed-arp-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*
@@ -30,6 +30,7 @@
#include <linux/in.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
@@ -62,21 +63,34 @@ static void batadv_dat_start_timer(struct batadv_priv *bat_priv)
}
/**
- * batadv_dat_entry_free_ref - decrement the dat_entry refcounter and possibly
- * free it
- * @dat_entry: the entry to free
+ * batadv_dat_entry_release - release dat_entry from lists and queue for free
+ * after rcu grace period
+ * @ref: kref pointer of the dat_entry
*/
-static void batadv_dat_entry_free_ref(struct batadv_dat_entry *dat_entry)
+static void batadv_dat_entry_release(struct kref *ref)
{
- if (atomic_dec_and_test(&dat_entry->refcount))
- kfree_rcu(dat_entry, rcu);
+ struct batadv_dat_entry *dat_entry;
+
+ dat_entry = container_of(ref, struct batadv_dat_entry, refcount);
+
+ kfree_rcu(dat_entry, rcu);
+}
+
+/**
+ * batadv_dat_entry_put - decrement the dat_entry refcounter and possibly
+ * release it
+ * @dat_entry: dat_entry to be free'd
+ */
+static void batadv_dat_entry_put(struct batadv_dat_entry *dat_entry)
+{
+ kref_put(&dat_entry->refcount, batadv_dat_entry_release);
}
/**
* batadv_dat_to_purge - check whether a dat_entry has to be purged or not
* @dat_entry: the entry to check
*
- * Returns true if the entry has to be purged now, false otherwise.
+ * Return: true if the entry has to be purged now, false otherwise.
*/
static bool batadv_dat_to_purge(struct batadv_dat_entry *dat_entry)
{
@@ -121,7 +135,7 @@ static void __batadv_dat_purge(struct batadv_priv *bat_priv,
continue;
hlist_del_rcu(&dat_entry->hash_entry);
- batadv_dat_entry_free_ref(dat_entry);
+ batadv_dat_entry_put(dat_entry);
}
spin_unlock_bh(list_lock);
}
@@ -151,7 +165,7 @@ static void batadv_dat_purge(struct work_struct *work)
* @node: node in the local table
* @data2: second object to compare the node to
*
- * Returns 1 if the two entries are the same, 0 otherwise.
+ * Return: 1 if the two entries are the same, 0 otherwise.
*/
static int batadv_compare_dat(const struct hlist_node *node, const void *data2)
{
@@ -166,7 +180,7 @@ static int batadv_compare_dat(const struct hlist_node *node, const void *data2)
* @skb: ARP packet
* @hdr_size: size of the possible header before the ARP packet
*
- * Returns the value of the hw_src field in the ARP packet.
+ * Return: the value of the hw_src field in the ARP packet.
*/
static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
{
@@ -183,7 +197,7 @@ static u8 *batadv_arp_hw_src(struct sk_buff *skb, int hdr_size)
* @skb: ARP packet
* @hdr_size: size of the possible header before the ARP packet
*
- * Returns the value of the ip_src field in the ARP packet.
+ * Return: the value of the ip_src field in the ARP packet.
*/
static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
{
@@ -195,7 +209,7 @@ static __be32 batadv_arp_ip_src(struct sk_buff *skb, int hdr_size)
* @skb: ARP packet
* @hdr_size: size of the possible header before the ARP packet
*
- * Returns the value of the hw_dst field in the ARP packet.
+ * Return: the value of the hw_dst field in the ARP packet.
*/
static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
{
@@ -207,7 +221,7 @@ static u8 *batadv_arp_hw_dst(struct sk_buff *skb, int hdr_size)
* @skb: ARP packet
* @hdr_size: size of the possible header before the ARP packet
*
- * Returns the value of the ip_dst field in the ARP packet.
+ * Return: the value of the ip_dst field in the ARP packet.
*/
static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
{
@@ -219,7 +233,7 @@ static __be32 batadv_arp_ip_dst(struct sk_buff *skb, int hdr_size)
* @data: data to hash
* @size: size of the hash table
*
- * Returns the selected index in the hash table for the given data.
+ * Return: the selected index in the hash table for the given data.
*/
static u32 batadv_hash_dat(const void *data, u32 size)
{
@@ -256,7 +270,7 @@ static u32 batadv_hash_dat(const void *data, u32 size)
* @ip: search key
* @vid: VLAN identifier
*
- * Returns the dat_entry if found, NULL otherwise.
+ * Return: the dat_entry if found, NULL otherwise.
*/
static struct batadv_dat_entry *
batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
@@ -281,7 +295,7 @@ batadv_dat_entry_hash_find(struct batadv_priv *bat_priv, __be32 ip,
if (dat_entry->ip != ip)
continue;
- if (!atomic_inc_not_zero(&dat_entry->refcount))
+ if (!kref_get_unless_zero(&dat_entry->refcount))
continue;
dat_entry_tmp = dat_entry;
@@ -326,7 +340,8 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
dat_entry->vid = vid;
ether_addr_copy(dat_entry->mac_addr, mac_addr);
dat_entry->last_update = jiffies;
- atomic_set(&dat_entry->refcount, 2);
+ kref_init(&dat_entry->refcount);
+ kref_get(&dat_entry->refcount);
hash_added = batadv_hash_add(bat_priv->dat.hash, batadv_compare_dat,
batadv_hash_dat, dat_entry,
@@ -334,7 +349,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
if (unlikely(hash_added != 0)) {
/* remove the reference for the hash */
- batadv_dat_entry_free_ref(dat_entry);
+ batadv_dat_entry_put(dat_entry);
goto out;
}
@@ -343,7 +358,7 @@ static void batadv_dat_entry_add(struct batadv_priv *bat_priv, __be32 ip,
out:
if (dat_entry)
- batadv_dat_entry_free_ref(dat_entry);
+ batadv_dat_entry_put(dat_entry);
}
#ifdef CONFIG_BATMAN_ADV_DEBUG
@@ -440,7 +455,7 @@ static void batadv_dbg_arp(struct batadv_priv *bat_priv, struct sk_buff *skb,
* @candidate: orig_node under evaluation
* @max_orig_node: last selected candidate
*
- * Returns true if the node has been elected as next candidate or false
+ * Return: true if the node has been elected as next candidate or false
* otherwise.
*/
static bool batadv_is_orig_node_eligible(struct batadv_dat_candidate *res,
@@ -527,12 +542,12 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
max_orig_node))
continue;
- if (!atomic_inc_not_zero(&orig_node->refcount))
+ if (!kref_get_unless_zero(&orig_node->refcount))
continue;
max = tmp_max;
if (max_orig_node)
- batadv_orig_node_free_ref(max_orig_node);
+ batadv_orig_node_put(max_orig_node);
max_orig_node = orig_node;
}
rcu_read_unlock();
@@ -558,7 +573,7 @@ static void batadv_choose_next_candidate(struct batadv_priv *bat_priv,
* closest values (from the LEFT, with wrap around if needed) then the hash
* value of the key. ip_dst is the key.
*
- * Returns the candidate array of size BATADV_DAT_CANDIDATE_NUM.
+ * Return: the candidate array of size BATADV_DAT_CANDIDATE_NUM.
*/
static struct batadv_dat_candidate *
batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
@@ -566,6 +581,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
int select;
batadv_dat_addr_t last_max = BATADV_DAT_ADDR_MAX, ip_key;
struct batadv_dat_candidate *res;
+ struct batadv_dat_entry dat;
if (!bat_priv->orig_hash)
return NULL;
@@ -575,7 +591,9 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
if (!res)
return NULL;
- ip_key = (batadv_dat_addr_t)batadv_hash_dat(&ip_dst,
+ dat.ip = ip_dst;
+ dat.vid = 0;
+ ip_key = (batadv_dat_addr_t)batadv_hash_dat(&dat,
BATADV_DAT_ADDR_MAX);
batadv_dbg(BATADV_DBG_DAT, bat_priv,
@@ -599,7 +617,7 @@ batadv_dat_select_candidates(struct batadv_priv *bat_priv, __be32 ip_dst)
* This function copies the skb with pskb_copy() and is sent as unicast packet
* to each of the selected candidates.
*
- * Returns true if the packet is sent to at least one candidate, false
+ * Return: true if the packet is sent to at least one candidate, false
* otherwise.
*/
static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
@@ -636,9 +654,7 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
goto free_neigh;
}
- send_status = batadv_send_skb_packet(tmp_skb,
- neigh_node->if_incoming,
- neigh_node->addr);
+ send_status = batadv_send_unicast_skb(tmp_skb, neigh_node);
if (send_status == NET_XMIT_SUCCESS) {
/* count the sent packet */
switch (packet_subtype) {
@@ -656,9 +672,9 @@ static bool batadv_dat_send_data(struct batadv_priv *bat_priv,
ret = true;
}
free_neigh:
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
free_orig:
- batadv_orig_node_free_ref(cand[i].orig_node);
+ batadv_orig_node_put(cand[i].orig_node);
}
out:
@@ -738,6 +754,8 @@ static void batadv_dat_hash_free(struct batadv_priv *bat_priv)
/**
* batadv_dat_init - initialise the DAT internals
* @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 in case of success, a negative error code otherwise
*/
int batadv_dat_init(struct batadv_priv *bat_priv)
{
@@ -776,6 +794,8 @@ void batadv_dat_free(struct batadv_priv *bat_priv)
* batadv_dat_cache_seq_print_text - print the local DAT hash table
* @seq: seq file to print on
* @offset: not used
+ *
+ * Return: always 0
*/
int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
{
@@ -818,7 +838,7 @@ int batadv_dat_cache_seq_print_text(struct seq_file *seq, void *offset)
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return 0;
}
@@ -828,7 +848,7 @@ out:
* @skb: packet to analyse
* @hdr_size: size of the possible header before the ARP packet in the skb
*
- * Returns the ARP type if the skb contains a valid ARP packet, 0 otherwise.
+ * Return: the ARP type if the skb contains a valid ARP packet, 0 otherwise.
*/
static u16 batadv_arp_get_type(struct batadv_priv *bat_priv,
struct sk_buff *skb, int hdr_size)
@@ -901,8 +921,9 @@ out:
* @skb: the buffer containing the packet to extract the VID from
* @hdr_size: the size of the batman-adv header encapsulating the packet
*
- * If the packet embedded in the skb is vlan tagged this function returns the
- * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned.
+ * Return: If the packet embedded in the skb is vlan tagged this function
+ * returns the VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS
+ * is returned.
*/
static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
{
@@ -927,7 +948,7 @@ static unsigned short batadv_dat_get_vid(struct sk_buff *skb, int *hdr_size)
* @bat_priv: the bat priv with all the soft interface information
* @skb: packet to check
*
- * Returns true if the message has been sent to the dht candidates, false
+ * Return: true if the message has been sent to the dht candidates, false
* otherwise. In case of a positive return value the message has to be enqueued
* to permit the fallback.
*/
@@ -1006,7 +1027,7 @@ bool batadv_dat_snoop_outgoing_arp_request(struct batadv_priv *bat_priv,
}
out:
if (dat_entry)
- batadv_dat_entry_free_ref(dat_entry);
+ batadv_dat_entry_put(dat_entry);
return ret;
}
@@ -1017,7 +1038,7 @@ out:
* @skb: packet to check
* @hdr_size: size of the encapsulation header
*
- * Returns true if the request has been answered, false otherwise.
+ * Return: true if the request has been answered, false otherwise.
*/
bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
struct sk_buff *skb, int hdr_size)
@@ -1086,7 +1107,7 @@ bool batadv_dat_snoop_incoming_arp_request(struct batadv_priv *bat_priv,
}
out:
if (dat_entry)
- batadv_dat_entry_free_ref(dat_entry);
+ batadv_dat_entry_put(dat_entry);
if (ret)
kfree_skb(skb);
return ret;
@@ -1140,7 +1161,7 @@ void batadv_dat_snoop_outgoing_arp_reply(struct batadv_priv *bat_priv,
* @skb: packet to check
* @hdr_size: size of the encapsulation header
*
- * Returns true if the packet was snooped and consumed by DAT. False if the
+ * Return: true if the packet was snooped and consumed by DAT. False if the
* packet has to be delivered to the interface
*/
bool batadv_dat_snoop_incoming_arp_reply(struct batadv_priv *bat_priv,
@@ -1197,7 +1218,7 @@ out:
* @bat_priv: the bat priv with all the soft interface information
* @forw_packet: the broadcast packet
*
- * Returns true if the node can drop the packet, false otherwise.
+ * Return: true if the node can drop the packet, false otherwise.
*/
bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
struct batadv_forw_packet *forw_packet)
@@ -1239,6 +1260,6 @@ bool batadv_dat_drop_broadcast_packet(struct batadv_priv *bat_priv,
out:
if (dat_entry)
- batadv_dat_entry_free_ref(dat_entry);
+ batadv_dat_entry_put(dat_entry);
return ret;
}
diff --git a/net/batman-adv/distributed-arp-table.h b/net/batman-adv/distributed-arp-table.h
index 26d4a525a798..813ecea96cf9 100644
--- a/net/batman-adv/distributed-arp-table.h
+++ b/net/batman-adv/distributed-arp-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2011-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2011-2016 B.A.T.M.A.N. contributors:
*
* Antonio Quartulli
*
diff --git a/net/batman-adv/fragmentation.c b/net/batman-adv/fragmentation.c
index 700c96c82a15..e6956d0746a2 100644
--- a/net/batman-adv/fragmentation.c
+++ b/net/batman-adv/fragmentation.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
*
@@ -71,21 +71,21 @@ void batadv_frag_purge_orig(struct batadv_orig_node *orig_node,
for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
chain = &orig_node->fragments[i];
- spin_lock_bh(&orig_node->fragments[i].lock);
+ spin_lock_bh(&chain->lock);
if (!check_cb || check_cb(chain)) {
- batadv_frag_clear_chain(&orig_node->fragments[i].head);
- orig_node->fragments[i].size = 0;
+ batadv_frag_clear_chain(&chain->head);
+ chain->size = 0;
}
- spin_unlock_bh(&orig_node->fragments[i].lock);
+ spin_unlock_bh(&chain->lock);
}
}
/**
* batadv_frag_size_limit - maximum possible size of packet to be fragmented
*
- * Returns the maximum size of payload that can be fragmented.
+ * Return: the maximum size of payload that can be fragmented.
*/
static int batadv_frag_size_limit(void)
{
@@ -107,7 +107,7 @@ static int batadv_frag_size_limit(void)
*
* Caller must hold chain->lock.
*
- * Returns true if chain is empty and caller can just insert the new fragment
+ * Return: true if chain is empty and caller can just insert the new fragment
* without searching for the right position.
*/
static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
@@ -136,7 +136,7 @@ static bool batadv_frag_init_chain(struct batadv_frag_table_entry *chain,
* Insert a new fragment into the reverse ordered chain in the right table
* entry. The hash table entry is cleared if "old" fragments exist in it.
*
- * Returns true if skb is buffered, false on error. If the chain has all the
+ * Return: true if skb is buffered, false on error. If the chain has all the
* fragments needed to merge the packet, the chain is moved to the passed head
* to avoid locking the chain in the table.
*/
@@ -242,12 +242,11 @@ err:
/**
* batadv_frag_merge_packets - merge a chain of fragments
* @chain: head of chain with fragments
- * @skb: packet with total size of skb after merging
*
* Expand the first skb in the chain and copy the content of the remaining
* skb's into the expanded one. After doing so, clear the chain.
*
- * Returns the merged skb or NULL on error.
+ * Return: the merged skb or NULL on error.
*/
static struct sk_buff *
batadv_frag_merge_packets(struct hlist_head *chain)
@@ -307,6 +306,9 @@ free:
* There are three possible outcomes: 1) Packet is merged: Return true and
* set *skb to merged packet; 2) Packet is buffered: Return true and set *skb
* to NULL; 3) Error: Return false and leave skb as is.
+ *
+ * Return: true when packet is merged or buffered, false when skb is not not
+ * used.
*/
bool batadv_frag_skb_buffer(struct sk_buff **skb,
struct batadv_orig_node *orig_node_src)
@@ -344,7 +346,7 @@ out_err:
* will exceed the MTU towards the next-hop. If so, the fragment is forwarded
* without merging it.
*
- * Returns true if the fragment is consumed/forwarded, false otherwise.
+ * Return: true if the fragment is consumed/forwarded, false otherwise.
*/
bool batadv_frag_skb_fwd(struct sk_buff *skb,
struct batadv_hard_iface *recv_if,
@@ -376,16 +378,15 @@ bool batadv_frag_skb_fwd(struct sk_buff *skb,
skb->len + ETH_HLEN);
packet->ttl--;
- batadv_send_skb_packet(skb, neigh_node->if_incoming,
- neigh_node->addr);
+ batadv_send_unicast_skb(skb, neigh_node);
ret = true;
}
out:
if (orig_node_dst)
- batadv_orig_node_free_ref(orig_node_dst);
+ batadv_orig_node_put(orig_node_dst);
if (neigh_node)
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
return ret;
}
@@ -399,7 +400,7 @@ out:
* passed mtu and the old one with the rest. The new skb contains data from the
* tail of the old skb.
*
- * Returns the new fragment, NULL on error.
+ * Return: the new fragment, NULL on error.
*/
static struct sk_buff *batadv_frag_create(struct sk_buff *skb,
struct batadv_frag_packet *frag_head,
@@ -433,7 +434,7 @@ err:
* @orig_node: final destination of the created fragments
* @neigh_node: next-hop of the created fragments
*
- * Returns true on success, false otherwise.
+ * Return: true on success, false otherwise.
*/
bool batadv_frag_send_packet(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
@@ -484,8 +485,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
skb_fragment->len + ETH_HLEN);
- batadv_send_skb_packet(skb_fragment, neigh_node->if_incoming,
- neigh_node->addr);
+ batadv_send_unicast_skb(skb_fragment, neigh_node);
frag_header.no++;
/* The initial check in this function should cover this case */
@@ -504,13 +504,13 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
batadv_inc_counter(bat_priv, BATADV_CNT_FRAG_TX);
batadv_add_counter(bat_priv, BATADV_CNT_FRAG_TX_BYTES,
skb->len + ETH_HLEN);
- batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+ batadv_send_unicast_skb(skb, neigh_node);
ret = true;
out_err:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return ret;
}
diff --git a/net/batman-adv/fragmentation.h b/net/batman-adv/fragmentation.h
index 8b9877e70b95..9ff77c7ef7c7 100644
--- a/net/batman-adv/fragmentation.h
+++ b/net/batman-adv/fragmentation.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2013-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2013-2016 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll <martin@hundeboll.net>
*
@@ -42,7 +42,7 @@ bool batadv_frag_send_packet(struct sk_buff *skb,
* batadv_frag_check_entry - check if a list of fragments has timed out
* @frags_entry: table entry to check
*
- * Returns true if the frags entry has timed out, false otherwise.
+ * Return: true if the frags entry has timed out, false otherwise.
*/
static inline bool
batadv_frag_check_entry(struct batadv_frag_table_entry *frags_entry)
diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index e6c8382c79ba..c59aff5ccac8 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -28,6 +28,7 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/rculist.h>
@@ -59,12 +60,28 @@
*/
#define BATADV_DHCP_CHADDR_OFFSET 28
-static void batadv_gw_node_free_ref(struct batadv_gw_node *gw_node)
+/**
+ * batadv_gw_node_release - release gw_node from lists and queue for free after
+ * rcu grace period
+ * @ref: kref pointer of the gw_node
+ */
+static void batadv_gw_node_release(struct kref *ref)
{
- if (atomic_dec_and_test(&gw_node->refcount)) {
- batadv_orig_node_free_ref(gw_node->orig_node);
- kfree_rcu(gw_node, rcu);
- }
+ struct batadv_gw_node *gw_node;
+
+ gw_node = container_of(ref, struct batadv_gw_node, refcount);
+
+ batadv_orig_node_put(gw_node->orig_node);
+ kfree_rcu(gw_node, rcu);
+}
+
+/**
+ * batadv_gw_node_put - decrement the gw_node refcounter and possibly release it
+ * @gw_node: gateway node to free
+ */
+static void batadv_gw_node_put(struct batadv_gw_node *gw_node)
+{
+ kref_put(&gw_node->refcount, batadv_gw_node_release);
}
static struct batadv_gw_node *
@@ -77,7 +94,7 @@ batadv_gw_get_selected_gw_node(struct batadv_priv *bat_priv)
if (!gw_node)
goto out;
- if (!atomic_inc_not_zero(&gw_node->refcount))
+ if (!kref_get_unless_zero(&gw_node->refcount))
gw_node = NULL;
out:
@@ -100,14 +117,14 @@ batadv_gw_get_selected_orig(struct batadv_priv *bat_priv)
if (!orig_node)
goto unlock;
- if (!atomic_inc_not_zero(&orig_node->refcount))
+ if (!kref_get_unless_zero(&orig_node->refcount))
orig_node = NULL;
unlock:
rcu_read_unlock();
out:
if (gw_node)
- batadv_gw_node_free_ref(gw_node);
+ batadv_gw_node_put(gw_node);
return orig_node;
}
@@ -118,14 +135,14 @@ static void batadv_gw_select(struct batadv_priv *bat_priv,
spin_lock_bh(&bat_priv->gw.list_lock);
- if (new_gw_node && !atomic_inc_not_zero(&new_gw_node->refcount))
+ if (new_gw_node && !kref_get_unless_zero(&new_gw_node->refcount))
new_gw_node = NULL;
curr_gw_node = rcu_dereference_protected(bat_priv->gw.curr_gw, 1);
rcu_assign_pointer(bat_priv->gw.curr_gw, new_gw_node);
if (curr_gw_node)
- batadv_gw_node_free_ref(curr_gw_node);
+ batadv_gw_node_put(curr_gw_node);
spin_unlock_bh(&bat_priv->gw.list_lock);
}
@@ -170,7 +187,7 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
if (!router_ifinfo)
goto next;
- if (!atomic_inc_not_zero(&gw_node->refcount))
+ if (!kref_get_unless_zero(&gw_node->refcount))
goto next;
tq_avg = router_ifinfo->bat_iv.tq_avg;
@@ -186,9 +203,9 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
((tmp_gw_factor == max_gw_factor) &&
(tq_avg > max_tq))) {
if (curr_gw)
- batadv_gw_node_free_ref(curr_gw);
+ batadv_gw_node_put(curr_gw);
curr_gw = gw_node;
- atomic_inc(&curr_gw->refcount);
+ kref_get(&curr_gw->refcount);
}
break;
@@ -201,9 +218,9 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
*/
if (tq_avg > max_tq) {
if (curr_gw)
- batadv_gw_node_free_ref(curr_gw);
+ batadv_gw_node_put(curr_gw);
curr_gw = gw_node;
- atomic_inc(&curr_gw->refcount);
+ kref_get(&curr_gw->refcount);
}
break;
}
@@ -214,12 +231,12 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
if (tmp_gw_factor > max_gw_factor)
max_gw_factor = tmp_gw_factor;
- batadv_gw_node_free_ref(gw_node);
+ batadv_gw_node_put(gw_node);
next:
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
if (router_ifinfo)
- batadv_neigh_ifinfo_free_ref(router_ifinfo);
+ batadv_neigh_ifinfo_put(router_ifinfo);
}
rcu_read_unlock();
@@ -255,7 +272,7 @@ void batadv_gw_check_client_stop(struct batadv_priv *bat_priv)
*/
batadv_throw_uevent(bat_priv, BATADV_UEV_GW, BATADV_UEV_DEL, NULL);
- batadv_gw_node_free_ref(curr_gw);
+ batadv_gw_node_put(curr_gw);
}
void batadv_gw_election(struct batadv_priv *bat_priv)
@@ -330,13 +347,13 @@ void batadv_gw_election(struct batadv_priv *bat_priv)
out:
if (curr_gw)
- batadv_gw_node_free_ref(curr_gw);
+ batadv_gw_node_put(curr_gw);
if (next_gw)
- batadv_gw_node_free_ref(next_gw);
+ batadv_gw_node_put(next_gw);
if (router)
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
if (router_ifinfo)
- batadv_neigh_ifinfo_free_ref(router_ifinfo);
+ batadv_neigh_ifinfo_put(router_ifinfo);
}
void batadv_gw_check_election(struct batadv_priv *bat_priv,
@@ -397,15 +414,15 @@ reselect:
batadv_gw_reselect(bat_priv);
out:
if (curr_gw_orig)
- batadv_orig_node_free_ref(curr_gw_orig);
+ batadv_orig_node_put(curr_gw_orig);
if (router_gw)
- batadv_neigh_node_free_ref(router_gw);
+ batadv_neigh_node_put(router_gw);
if (router_orig)
- batadv_neigh_node_free_ref(router_orig);
+ batadv_neigh_node_put(router_orig);
if (router_gw_tq)
- batadv_neigh_ifinfo_free_ref(router_gw_tq);
+ batadv_neigh_ifinfo_put(router_gw_tq);
if (router_orig_tq)
- batadv_neigh_ifinfo_free_ref(router_orig_tq);
+ batadv_neigh_ifinfo_put(router_orig_tq);
}
/**
@@ -423,12 +440,12 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
if (gateway->bandwidth_down == 0)
return;
- if (!atomic_inc_not_zero(&orig_node->refcount))
+ if (!kref_get_unless_zero(&orig_node->refcount))
return;
gw_node = kzalloc(sizeof(*gw_node), GFP_ATOMIC);
if (!gw_node) {
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return;
}
@@ -436,7 +453,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
gw_node->orig_node = orig_node;
gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
- atomic_set(&gw_node->refcount, 1);
+ kref_init(&gw_node->refcount);
spin_lock_bh(&bat_priv->gw.list_lock);
hlist_add_head_rcu(&gw_node->list, &bat_priv->gw.list);
@@ -456,7 +473,7 @@ static void batadv_gw_node_add(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @orig_node: originator announcing gateway capabilities
*
- * Returns gateway node if found or NULL otherwise.
+ * Return: gateway node if found or NULL otherwise.
*/
static struct batadv_gw_node *
batadv_gw_node_get(struct batadv_priv *bat_priv,
@@ -469,7 +486,7 @@ batadv_gw_node_get(struct batadv_priv *bat_priv,
if (gw_node_tmp->orig_node != orig_node)
continue;
- if (!atomic_inc_not_zero(&gw_node_tmp->refcount))
+ if (!kref_get_unless_zero(&gw_node_tmp->refcount))
continue;
gw_node = gw_node_tmp;
@@ -527,22 +544,23 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
* gets dereferenced.
*/
spin_lock_bh(&bat_priv->gw.list_lock);
- hlist_del_init_rcu(&gw_node->list);
+ if (!hlist_unhashed(&gw_node->list)) {
+ hlist_del_init_rcu(&gw_node->list);
+ batadv_gw_node_put(gw_node);
+ }
spin_unlock_bh(&bat_priv->gw.list_lock);
- batadv_gw_node_free_ref(gw_node);
-
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
if (gw_node == curr_gw)
batadv_gw_reselect(bat_priv);
if (curr_gw)
- batadv_gw_node_free_ref(curr_gw);
+ batadv_gw_node_put(curr_gw);
}
out:
if (gw_node)
- batadv_gw_node_free_ref(gw_node);
+ batadv_gw_node_put(gw_node);
}
void batadv_gw_node_delete(struct batadv_priv *bat_priv,
@@ -565,7 +583,7 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
hlist_for_each_entry_safe(gw_node, node_tmp,
&bat_priv->gw.list, list) {
hlist_del_init_rcu(&gw_node->list);
- batadv_gw_node_free_ref(gw_node);
+ batadv_gw_node_put(gw_node);
}
spin_unlock_bh(&bat_priv->gw.list_lock);
}
@@ -602,12 +620,12 @@ static int batadv_write_buffer_text(struct batadv_priv *bat_priv,
ret = seq_has_overflowed(seq) ? -1 : 0;
if (curr_gw)
- batadv_gw_node_free_ref(curr_gw);
+ batadv_gw_node_put(curr_gw);
out:
if (router_ifinfo)
- batadv_neigh_ifinfo_free_ref(router_ifinfo);
+ batadv_neigh_ifinfo_put(router_ifinfo);
if (router)
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
return ret;
}
@@ -644,7 +662,7 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, void *offset)
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return 0;
}
@@ -655,13 +673,13 @@ out:
* @chaddr: buffer where the client address will be stored. Valid
* only if the function returns BATADV_DHCP_TO_CLIENT
*
- * Returns:
+ * This function may re-allocate the data buffer of the skb passed as argument.
+ *
+ * Return:
* - BATADV_DHCP_NO if the packet is not a dhcp message or if there was an error
* while parsing it
* - BATADV_DHCP_TO_SERVER if this is a message going to the DHCP server
* - BATADV_DHCP_TO_CLIENT if this is a message going to a DHCP client
- *
- * This function may re-allocate the data buffer of the skb passed as argument.
*/
enum batadv_dhcp_recipient
batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
@@ -776,11 +794,11 @@ batadv_gw_dhcp_recipient_get(struct sk_buff *skb, unsigned int *header_len,
* server. Due to topology changes it may be the case that the GW server
* previously selected is not the best one anymore.
*
- * Returns true if the packet destination is unicast and it is not the best gw,
- * false otherwise.
- *
* This call might reallocate skb data.
* Must be invoked only when the DHCP packet is going TO a DHCP SERVER.
+ *
+ * Return: true if the packet destination is unicast and it is not the best gw,
+ * false otherwise.
*/
bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
struct sk_buff *skb)
@@ -838,7 +856,7 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
goto out;
curr_tq_avg = curr_ifinfo->bat_iv.tq_avg;
- batadv_neigh_ifinfo_free_ref(curr_ifinfo);
+ batadv_neigh_ifinfo_put(curr_ifinfo);
break;
case BATADV_GW_MODE_OFF:
@@ -856,18 +874,18 @@ bool batadv_gw_out_of_range(struct batadv_priv *bat_priv,
if ((curr_tq_avg - old_ifinfo->bat_iv.tq_avg) > BATADV_GW_THRESHOLD)
out_of_range = true;
- batadv_neigh_ifinfo_free_ref(old_ifinfo);
+ batadv_neigh_ifinfo_put(old_ifinfo);
out:
if (orig_dst_node)
- batadv_orig_node_free_ref(orig_dst_node);
+ batadv_orig_node_put(orig_dst_node);
if (curr_gw)
- batadv_gw_node_free_ref(curr_gw);
+ batadv_gw_node_put(curr_gw);
if (gw_node)
- batadv_gw_node_free_ref(gw_node);
+ batadv_gw_node_put(gw_node);
if (neigh_old)
- batadv_neigh_node_free_ref(neigh_old);
+ batadv_neigh_node_put(neigh_old);
if (neigh_curr)
- batadv_neigh_node_free_ref(neigh_curr);
+ batadv_neigh_node_put(neigh_curr);
return out_of_range;
}
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index fa9527785ed3..582dd8c413c8 100644
--- a/net/batman-adv/gateway_client.h
+++ b/net/batman-adv/gateway_client.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/gateway_common.c b/net/batman-adv/gateway_common.c
index 0cb5e6b6f6d4..4423047889e1 100644
--- a/net/batman-adv/gateway_common.c
+++ b/net/batman-adv/gateway_common.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -31,27 +31,23 @@
#include "packet.h"
/**
- * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download
- * and upload bandwidth information
+ * batadv_parse_throughput - parse supplied string buffer to extract throughput
+ * information
* @net_dev: the soft interface net device
* @buff: string buffer to parse
- * @down: pointer holding the returned download bandwidth information
- * @up: pointer holding the returned upload bandwidth information
+ * @description: text shown when throughput string cannot be parsed
+ * @throughput: pointer holding the returned throughput information
*
- * Returns false on parse error and true otherwise.
+ * Return: false on parse error and true otherwise.
*/
-static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
- u32 *down, u32 *up)
+bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
+ const char *description, u32 *throughput)
{
enum batadv_bandwidth_units bw_unit_type = BATADV_BW_UNIT_KBIT;
- char *slash_ptr, *tmp_ptr;
- u64 ldown, lup;
+ u64 lthroughput;
+ char *tmp_ptr;
int ret;
- slash_ptr = strchr(buff, '/');
- if (slash_ptr)
- *slash_ptr = 0;
-
if (strlen(buff) > 4) {
tmp_ptr = buff + strlen(buff) - 4;
@@ -63,90 +59,75 @@ static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
*tmp_ptr = '\0';
}
- ret = kstrtou64(buff, 10, &ldown);
+ ret = kstrtou64(buff, 10, &lthroughput);
if (ret) {
batadv_err(net_dev,
- "Download speed of gateway mode invalid: %s\n",
- buff);
+ "Invalid throughput speed for %s: %s\n",
+ description, buff);
return false;
}
switch (bw_unit_type) {
case BATADV_BW_UNIT_MBIT:
/* prevent overflow */
- if (U64_MAX / 10 < ldown) {
+ if (U64_MAX / 10 < lthroughput) {
batadv_err(net_dev,
- "Download speed of gateway mode too large: %s\n",
- buff);
+ "Throughput speed for %s too large: %s\n",
+ description, buff);
return false;
}
- ldown *= 10;
+ lthroughput *= 10;
break;
case BATADV_BW_UNIT_KBIT:
default:
- ldown = div_u64(ldown, 100);
+ lthroughput = div_u64(lthroughput, 100);
break;
}
- if (U32_MAX < ldown) {
+ if (lthroughput > U32_MAX) {
batadv_err(net_dev,
- "Download speed of gateway mode too large: %s\n",
- buff);
+ "Throughput speed for %s too large: %s\n",
+ description, buff);
return false;
}
- *down = ldown;
-
- /* we also got some upload info */
- if (slash_ptr) {
- bw_unit_type = BATADV_BW_UNIT_KBIT;
-
- if (strlen(slash_ptr + 1) > 4) {
- tmp_ptr = slash_ptr + 1 - 4 + strlen(slash_ptr + 1);
+ *throughput = lthroughput;
- if (strncasecmp(tmp_ptr, "mbit", 4) == 0)
- bw_unit_type = BATADV_BW_UNIT_MBIT;
+ return true;
+}
- if ((strncasecmp(tmp_ptr, "kbit", 4) == 0) ||
- (bw_unit_type == BATADV_BW_UNIT_MBIT))
- *tmp_ptr = '\0';
- }
+/**
+ * batadv_parse_gw_bandwidth - parse supplied string buffer to extract download
+ * and upload bandwidth information
+ * @net_dev: the soft interface net device
+ * @buff: string buffer to parse
+ * @down: pointer holding the returned download bandwidth information
+ * @up: pointer holding the returned upload bandwidth information
+ *
+ * Return: false on parse error and true otherwise.
+ */
+static bool batadv_parse_gw_bandwidth(struct net_device *net_dev, char *buff,
+ u32 *down, u32 *up)
+{
+ char *slash_ptr;
+ bool ret;
- ret = kstrtou64(slash_ptr + 1, 10, &lup);
- if (ret) {
- batadv_err(net_dev,
- "Upload speed of gateway mode invalid: %s\n",
- slash_ptr + 1);
- return false;
- }
+ slash_ptr = strchr(buff, '/');
+ if (slash_ptr)
+ *slash_ptr = 0;
- switch (bw_unit_type) {
- case BATADV_BW_UNIT_MBIT:
- /* prevent overflow */
- if (U64_MAX / 10 < lup) {
- batadv_err(net_dev,
- "Upload speed of gateway mode too large: %s\n",
- slash_ptr + 1);
- return false;
- }
-
- lup *= 10;
- break;
- case BATADV_BW_UNIT_KBIT:
- default:
- lup = div_u64(lup, 100);
- break;
- }
+ ret = batadv_parse_throughput(net_dev, buff, "download gateway speed",
+ down);
+ if (!ret)
+ return false;
- if (U32_MAX < lup) {
- batadv_err(net_dev,
- "Upload speed of gateway mode too large: %s\n",
- slash_ptr + 1);
+ /* we also got some upload info */
+ if (slash_ptr) {
+ ret = batadv_parse_throughput(net_dev, slash_ptr + 1,
+ "upload gateway speed", up);
+ if (!ret)
return false;
- }
-
- *up = lup;
}
return true;
diff --git a/net/batman-adv/gateway_common.h b/net/batman-adv/gateway_common.h
index ab893e318229..8a5e1ddf1175 100644
--- a/net/batman-adv/gateway_common.h
+++ b/net/batman-adv/gateway_common.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -49,5 +49,7 @@ ssize_t batadv_gw_bandwidth_set(struct net_device *net_dev, char *buff,
void batadv_gw_tvlv_container_update(struct batadv_priv *bat_priv);
void batadv_gw_init(struct batadv_priv *bat_priv);
void batadv_gw_free(struct batadv_priv *bat_priv);
+bool batadv_parse_throughput(struct net_device *net_dev, char *buff,
+ const char *description, u32 *throughput);
#endif /* _NET_BATMAN_ADV_GATEWAY_COMMON_H_ */
diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index f11345e163d7..b22b2775a0a5 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -18,6 +18,7 @@
#include "hard-interface.h"
#include "main.h"
+#include <linux/atomic.h>
#include <linux/bug.h>
#include <linux/byteorder/generic.h>
#include <linux/errno.h>
@@ -26,12 +27,14 @@
#include <linux/if_ether.h>
#include <linux/if.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
#include <linux/rculist.h>
#include <linux/rtnetlink.h>
#include <linux/slab.h>
+#include <linux/spinlock.h>
#include <linux/workqueue.h>
#include <net/net_namespace.h>
@@ -46,13 +49,19 @@
#include "sysfs.h"
#include "translation-table.h"
-void batadv_hardif_free_rcu(struct rcu_head *rcu)
+/**
+ * batadv_hardif_release - release hard interface from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the hard interface
+ */
+void batadv_hardif_release(struct kref *ref)
{
struct batadv_hard_iface *hard_iface;
- hard_iface = container_of(rcu, struct batadv_hard_iface, rcu);
+ hard_iface = container_of(ref, struct batadv_hard_iface, refcount);
dev_put(hard_iface->net_dev);
- kfree(hard_iface);
+
+ kfree_rcu(hard_iface, rcu);
}
struct batadv_hard_iface *
@@ -63,7 +72,7 @@ batadv_hardif_get_by_netdev(const struct net_device *net_dev)
rcu_read_lock();
list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
if (hard_iface->net_dev == net_dev &&
- atomic_inc_not_zero(&hard_iface->refcount))
+ kref_get_unless_zero(&hard_iface->refcount))
goto out;
}
@@ -75,6 +84,28 @@ out:
}
/**
+ * batadv_mutual_parents - check if two devices are each others parent
+ * @dev1: 1st net_device
+ * @dev2: 2nd net_device
+ *
+ * veth devices come in pairs and each is the parent of the other!
+ *
+ * Return: true if the devices are each others parent, otherwise false
+ */
+static bool batadv_mutual_parents(const struct net_device *dev1,
+ const struct net_device *dev2)
+{
+ int dev1_parent_iflink = dev_get_iflink(dev1);
+ int dev2_parent_iflink = dev_get_iflink(dev2);
+
+ if (!dev1_parent_iflink || !dev2_parent_iflink)
+ return false;
+
+ return (dev1_parent_iflink == dev2->ifindex) &&
+ (dev2_parent_iflink == dev1->ifindex);
+}
+
+/**
* batadv_is_on_batman_iface - check if a device is a batman iface descendant
* @net_dev: the device to check
*
@@ -84,7 +115,7 @@ out:
* This function recursively checks all the fathers of the device passed as
* argument looking for a batman-adv soft interface.
*
- * Returns true if the device is descendant of a batman-adv mesh interface (or
+ * Return: true if the device is descendant of a batman-adv mesh interface (or
* if it is a batman-adv interface itself), false otherwise
*/
static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
@@ -107,6 +138,9 @@ static bool batadv_is_on_batman_iface(const struct net_device *net_dev)
if (WARN(!parent_dev, "Cannot find parent device"))
return false;
+ if (batadv_mutual_parents(net_dev, parent_dev))
+ return false;
+
ret = batadv_is_on_batman_iface(parent_dev);
return ret;
@@ -135,7 +169,7 @@ static int batadv_is_valid_iface(const struct net_device *net_dev)
* interface
* @net_device: the device to check
*
- * Returns true if the net device is a 802.11 wireless device, false otherwise.
+ * Return: true if the net device is a 802.11 wireless device, false otherwise.
*/
bool batadv_is_wifi_netdev(struct net_device *net_device)
{
@@ -168,7 +202,7 @@ batadv_hardif_get_active(const struct net_device *soft_iface)
continue;
if (hard_iface->if_status == BATADV_IF_ACTIVE &&
- atomic_inc_not_zero(&hard_iface->refcount))
+ kref_get_unless_zero(&hard_iface->refcount))
goto out;
}
@@ -192,7 +226,7 @@ static void batadv_primary_if_update_addr(struct batadv_priv *bat_priv,
batadv_bla_update_orig_address(bat_priv, primary_if, oldif);
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
}
static void batadv_primary_if_select(struct batadv_priv *bat_priv,
@@ -202,7 +236,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv,
ASSERT_RTNL();
- if (new_hard_iface && !atomic_inc_not_zero(&new_hard_iface->refcount))
+ if (new_hard_iface && !kref_get_unless_zero(&new_hard_iface->refcount))
new_hard_iface = NULL;
curr_hard_iface = rcu_dereference_protected(bat_priv->primary_if, 1);
@@ -216,7 +250,7 @@ static void batadv_primary_if_select(struct batadv_priv *bat_priv,
out:
if (curr_hard_iface)
- batadv_hardif_free_ref(curr_hard_iface);
+ batadv_hardif_put(curr_hard_iface);
}
static bool
@@ -375,7 +409,7 @@ batadv_hardif_activate_interface(struct batadv_hard_iface *hard_iface)
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
}
static void
@@ -400,7 +434,8 @@ batadv_hardif_deactivate_interface(struct batadv_hard_iface *hard_iface)
*
* Invoke ndo_del_slave on master passing slave as argument. In this way slave
* is free'd and master can correctly change its internal state.
- * Return 0 on success, a negative value representing the error otherwise
+ *
+ * Return: 0 on success, a negative value representing the error otherwise
*/
static int batadv_master_del_slave(struct batadv_hard_iface *slave,
struct net_device *master)
@@ -429,7 +464,7 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
if (hard_iface->if_status != BATADV_IF_NOT_IN_USE)
goto out;
- if (!atomic_inc_not_zero(&hard_iface->refcount))
+ if (!kref_get_unless_zero(&hard_iface->refcount))
goto out;
soft_iface = dev_get_by_name(&init_net, iface_name);
@@ -464,7 +499,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface *hard_iface,
hard_iface->soft_iface = soft_iface;
bat_priv = netdev_priv(hard_iface->soft_iface);
- ret = netdev_master_upper_dev_link(hard_iface->net_dev, soft_iface);
+ ret = netdev_master_upper_dev_link(hard_iface->net_dev,
+ soft_iface, NULL, NULL);
if (ret)
goto err_dev;
@@ -526,7 +562,7 @@ err_dev:
hard_iface->soft_iface = NULL;
dev_put(soft_iface);
err:
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
return ret;
}
@@ -557,7 +593,7 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
batadv_primary_if_select(bat_priv, new_if);
if (new_if)
- batadv_hardif_free_ref(new_if);
+ batadv_hardif_put(new_if);
}
bat_priv->bat_algo_ops->bat_iface_disable(hard_iface);
@@ -580,11 +616,11 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
}
hard_iface->soft_iface = NULL;
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
}
/**
@@ -603,7 +639,7 @@ static void batadv_hardif_remove_interface_finish(struct work_struct *work)
batadv_debugfs_del_hardif(hard_iface);
batadv_sysfs_del_hardif(&hard_iface->hardif_obj);
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
}
static struct batadv_hard_iface *
@@ -638,15 +674,19 @@ batadv_hardif_add_interface(struct net_device *net_dev)
goto free_sysfs;
INIT_LIST_HEAD(&hard_iface->list);
+ INIT_HLIST_HEAD(&hard_iface->neigh_list);
INIT_WORK(&hard_iface->cleanup_work,
batadv_hardif_remove_interface_finish);
+ spin_lock_init(&hard_iface->neigh_list_lock);
+
hard_iface->num_bcasts = BATADV_NUM_BCASTS_DEFAULT;
if (batadv_is_wifi_netdev(net_dev))
hard_iface->num_bcasts = BATADV_NUM_BCASTS_WIRELESS;
/* extra reference for return */
- atomic_set(&hard_iface->refcount, 2);
+ kref_init(&hard_iface->refcount);
+ kref_get(&hard_iface->refcount);
batadv_check_known_mac_addr(hard_iface->net_dev);
list_add_tail_rcu(&hard_iface->list, &batadv_hardif_list);
@@ -708,7 +748,8 @@ static int batadv_hard_if_event(struct notifier_block *this,
}
hard_iface = batadv_hardif_get_by_netdev(net_dev);
- if (!hard_iface && event == NETDEV_REGISTER)
+ if (!hard_iface && (event == NETDEV_REGISTER ||
+ event == NETDEV_POST_TYPE_CHANGE))
hard_iface = batadv_hardif_add_interface(net_dev);
if (!hard_iface)
@@ -723,6 +764,7 @@ static int batadv_hard_if_event(struct notifier_block *this,
batadv_hardif_deactivate_interface(hard_iface);
break;
case NETDEV_UNREGISTER:
+ case NETDEV_PRE_TYPE_CHANGE:
list_del_rcu(&hard_iface->list);
batadv_hardif_remove_interface(hard_iface);
@@ -752,10 +794,10 @@ static int batadv_hard_if_event(struct notifier_block *this,
}
hardif_put:
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return NOTIFY_DONE;
}
diff --git a/net/batman-adv/hard-interface.h b/net/batman-adv/hard-interface.h
index 5a31420513e1..d74f1983f33e 100644
--- a/net/batman-adv/hard-interface.h
+++ b/net/batman-adv/hard-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -20,8 +20,8 @@
#include "main.h"
-#include <linux/atomic.h>
#include <linux/compiler.h>
+#include <linux/kref.h>
#include <linux/notifier.h>
#include <linux/rcupdate.h>
#include <linux/stddef.h>
@@ -61,30 +61,16 @@ void batadv_hardif_disable_interface(struct batadv_hard_iface *hard_iface,
void batadv_hardif_remove_interfaces(void);
int batadv_hardif_min_mtu(struct net_device *soft_iface);
void batadv_update_min_mtu(struct net_device *soft_iface);
-void batadv_hardif_free_rcu(struct rcu_head *rcu);
+void batadv_hardif_release(struct kref *ref);
/**
- * batadv_hardif_free_ref - decrement the hard interface refcounter and
- * possibly free it
+ * batadv_hardif_put - decrement the hard interface refcounter and possibly
+ * release it
* @hard_iface: the hard interface to free
*/
-static inline void
-batadv_hardif_free_ref(struct batadv_hard_iface *hard_iface)
+static inline void batadv_hardif_put(struct batadv_hard_iface *hard_iface)
{
- if (atomic_dec_and_test(&hard_iface->refcount))
- call_rcu(&hard_iface->rcu, batadv_hardif_free_rcu);
-}
-
-/**
- * batadv_hardif_free_ref_now - decrement the hard interface refcounter and
- * possibly free it (without rcu callback)
- * @hard_iface: the hard interface to free
- */
-static inline void
-batadv_hardif_free_ref_now(struct batadv_hard_iface *hard_iface)
-{
- if (atomic_dec_and_test(&hard_iface->refcount))
- batadv_hardif_free_rcu(&hard_iface->rcu);
+ kref_put(&hard_iface->refcount, batadv_hardif_release);
}
static inline struct batadv_hard_iface *
@@ -97,7 +83,7 @@ batadv_primary_if_get_selected(struct batadv_priv *bat_priv)
if (!hard_iface)
goto out;
- if (!atomic_inc_not_zero(&hard_iface->refcount))
+ if (!kref_get_unless_zero(&hard_iface->refcount))
hard_iface = NULL;
out:
diff --git a/net/batman-adv/hash.c b/net/batman-adv/hash.c
index 2ea6a18d793f..a0a0fdb85805 100644
--- a/net/batman-adv/hash.c
+++ b/net/batman-adv/hash.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
diff --git a/net/batman-adv/hash.h b/net/batman-adv/hash.h
index 377626250ac7..9bb57b87447c 100644
--- a/net/batman-adv/hash.h
+++ b/net/batman-adv/hash.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2006-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2006-2016 B.A.T.M.A.N. contributors:
*
* Simon Wunderlich, Marek Lindner
*
@@ -30,14 +30,17 @@
struct lock_class_key;
/* callback to a compare function. should compare 2 element datas for their
- * keys, return 0 if same and not 0 if not same
+ * keys
+ *
+ * Return: 0 if same and not 0 if not same
*/
typedef int (*batadv_hashdata_compare_cb)(const struct hlist_node *,
const void *);
-/* the hashfunction, should return an index
- * based on the key in the data of the first
- * argument and the size the second
+/* the hashfunction
+ *
+ * Return: an index based on the key in the data of the first argument and the
+ * size the second
*/
typedef u32 (*batadv_hashdata_choose_cb)(const void *, u32);
typedef void (*batadv_hashdata_free_cb)(struct hlist_node *, void *);
@@ -96,7 +99,7 @@ static inline void batadv_hash_delete(struct batadv_hashtable *hash,
* @data: data passed to the aforementioned callbacks as argument
* @data_node: to be added element
*
- * Returns 0 on success, 1 if the element already is in the hash
+ * Return: 0 on success, 1 if the element already is in the hash
* and -1 on error.
*/
static inline int batadv_hash_add(struct batadv_hashtable *hash,
@@ -139,10 +142,11 @@ out:
return ret;
}
-/* removes data from hash, if found. returns pointer do data on success, so you
- * can remove the used structure yourself, or NULL on error . data could be the
- * structure you use with just the key filled, we just need the key for
- * comparing.
+/* removes data from hash, if found. data could be the structure you use with
+ * just the key filled, we just need the key for comparing.
+ *
+ * Return: returns pointer do data on success, so you can remove the used
+ * structure yourself, or NULL on error
*/
static inline void *batadv_hash_remove(struct batadv_hashtable *hash,
batadv_hashdata_compare_cb compare,
diff --git a/net/batman-adv/icmp_socket.c b/net/batman-adv/icmp_socket.c
index bcabb5e3f4d3..14d0013b387e 100644
--- a/net/batman-adv/icmp_socket.c
+++ b/net/batman-adv/icmp_socket.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -278,7 +278,7 @@ static ssize_t batadv_socket_write(struct file *file, const char __user *buff,
ether_addr_copy(icmp_header->orig, primary_if->net_dev->dev_addr);
- batadv_send_skb_packet(skb, neigh_node->if_incoming, neigh_node->addr);
+ batadv_send_unicast_skb(skb, neigh_node);
goto out;
dst_unreach:
@@ -288,11 +288,11 @@ free_skb:
kfree_skb(skb);
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
if (neigh_node)
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return len;
}
diff --git a/net/batman-adv/icmp_socket.h b/net/batman-adv/icmp_socket.h
index e937143f0b10..618d5de06f20 100644
--- a/net/batman-adv/icmp_socket.h
+++ b/net/batman-adv/icmp_socket.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/main.c b/net/batman-adv/main.c
index d7f17c1aa4a4..d64ddb961979 100644
--- a/net/batman-adv/main.c
+++ b/net/batman-adv/main.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -29,6 +29,7 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/module.h>
@@ -86,6 +87,7 @@ static int __init batadv_init(void)
batadv_recv_handler_init();
+ batadv_v_init();
batadv_iv_init();
batadv_nc_init();
@@ -158,6 +160,10 @@ int batadv_mesh_init(struct net_device *soft_iface)
INIT_HLIST_HEAD(&bat_priv->tvlv.handler_list);
INIT_HLIST_HEAD(&bat_priv->softif_vlan_list);
+ ret = batadv_v_mesh_init(bat_priv);
+ if (ret < 0)
+ goto err;
+
ret = batadv_originator_init(bat_priv);
if (ret < 0)
goto err;
@@ -200,6 +206,8 @@ void batadv_mesh_free(struct net_device *soft_iface)
batadv_purge_outstanding_packets(bat_priv, NULL);
batadv_gw_node_free(bat_priv);
+
+ batadv_v_mesh_free(bat_priv);
batadv_nc_mesh_free(bat_priv);
batadv_dat_free(bat_priv);
batadv_bla_free(bat_priv);
@@ -233,7 +241,7 @@ void batadv_mesh_free(struct net_device *soft_iface)
* @bat_priv: the bat priv with all the soft interface information
* @addr: the address to check
*
- * Returns 'true' if the mac address was found, false otherwise.
+ * Return: 'true' if the mac address was found, false otherwise.
*/
bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
{
@@ -262,7 +270,7 @@ bool batadv_is_my_mac(struct batadv_priv *bat_priv, const u8 *addr)
* function that requires the primary interface
* @seq: debugfs table seq_file struct
*
- * Returns primary interface if found or NULL otherwise.
+ * Return: primary interface if found or NULL otherwise.
*/
struct batadv_hard_iface *
batadv_seq_print_text_primary_if_get(struct seq_file *seq)
@@ -286,7 +294,7 @@ batadv_seq_print_text_primary_if_get(struct seq_file *seq)
seq_printf(seq,
"BATMAN mesh %s disabled - primary interface not active\n",
net_dev->name);
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
primary_if = NULL;
out:
@@ -297,7 +305,7 @@ out:
* batadv_max_header_len - calculate maximum encapsulation overhead for a
* payload packet
*
- * Return the maximum encapsulation overhead in bytes.
+ * Return: the maximum encapsulation overhead in bytes.
*/
int batadv_max_header_len(void)
{
@@ -552,7 +560,7 @@ int batadv_algo_register(struct batadv_algo_ops *bat_algo_ops)
!bat_algo_ops->bat_ogm_schedule ||
!bat_algo_ops->bat_ogm_emit ||
!bat_algo_ops->bat_neigh_cmp ||
- !bat_algo_ops->bat_neigh_is_equiv_or_better) {
+ !bat_algo_ops->bat_neigh_is_similar_or_better) {
pr_info("Routing algo '%s' does not implement required ops\n",
bat_algo_ops->name);
return -EINVAL;
@@ -599,6 +607,8 @@ int batadv_algo_seq_print_text(struct seq_file *seq, void *offset)
*
* payload_ptr must always point to an address in the skb head buffer and not to
* a fragment.
+ *
+ * Return: big endian crc32c of the checksummed data
*/
__be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
{
@@ -622,15 +632,26 @@ __be32 batadv_skb_crc32(struct sk_buff *skb, u8 *payload_ptr)
}
/**
- * batadv_tvlv_handler_free_ref - decrement the tvlv handler refcounter and
- * possibly free it
+ * batadv_tvlv_handler_release - release tvlv handler from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the tvlv
+ */
+static void batadv_tvlv_handler_release(struct kref *ref)
+{
+ struct batadv_tvlv_handler *tvlv_handler;
+
+ tvlv_handler = container_of(ref, struct batadv_tvlv_handler, refcount);
+ kfree_rcu(tvlv_handler, rcu);
+}
+
+/**
+ * batadv_tvlv_handler_put - decrement the tvlv container refcounter and
+ * possibly release it
* @tvlv_handler: the tvlv handler to free
*/
-static void
-batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler)
+static void batadv_tvlv_handler_put(struct batadv_tvlv_handler *tvlv_handler)
{
- if (atomic_dec_and_test(&tvlv_handler->refcount))
- kfree_rcu(tvlv_handler, rcu);
+ kref_put(&tvlv_handler->refcount, batadv_tvlv_handler_release);
}
/**
@@ -640,7 +661,7 @@ batadv_tvlv_handler_free_ref(struct batadv_tvlv_handler *tvlv_handler)
* @type: tvlv handler type to look for
* @version: tvlv handler version to look for
*
- * Returns tvlv handler if found or NULL otherwise.
+ * Return: tvlv handler if found or NULL otherwise.
*/
static struct batadv_tvlv_handler
*batadv_tvlv_handler_get(struct batadv_priv *bat_priv, u8 type, u8 version)
@@ -656,7 +677,7 @@ static struct batadv_tvlv_handler
if (tvlv_handler_tmp->version != version)
continue;
- if (!atomic_inc_not_zero(&tvlv_handler_tmp->refcount))
+ if (!kref_get_unless_zero(&tvlv_handler_tmp->refcount))
continue;
tvlv_handler = tvlv_handler_tmp;
@@ -668,14 +689,25 @@ static struct batadv_tvlv_handler
}
/**
- * batadv_tvlv_container_free_ref - decrement the tvlv container refcounter and
- * possibly free it
+ * batadv_tvlv_container_release - release tvlv from lists and free
+ * @ref: kref pointer of the tvlv
+ */
+static void batadv_tvlv_container_release(struct kref *ref)
+{
+ struct batadv_tvlv_container *tvlv;
+
+ tvlv = container_of(ref, struct batadv_tvlv_container, refcount);
+ kfree(tvlv);
+}
+
+/**
+ * batadv_tvlv_container_put - decrement the tvlv container refcounter and
+ * possibly release it
* @tvlv: the tvlv container to free
*/
-static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv)
+static void batadv_tvlv_container_put(struct batadv_tvlv_container *tvlv)
{
- if (atomic_dec_and_test(&tvlv->refcount))
- kfree(tvlv);
+ kref_put(&tvlv->refcount, batadv_tvlv_container_release);
}
/**
@@ -688,13 +720,15 @@ static void batadv_tvlv_container_free_ref(struct batadv_tvlv_container *tvlv)
* Has to be called with the appropriate locks being acquired
* (tvlv.container_list_lock).
*
- * Returns tvlv container if found or NULL otherwise.
+ * Return: tvlv container if found or NULL otherwise.
*/
static struct batadv_tvlv_container
*batadv_tvlv_container_get(struct batadv_priv *bat_priv, u8 type, u8 version)
{
struct batadv_tvlv_container *tvlv_tmp, *tvlv = NULL;
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
hlist_for_each_entry(tvlv_tmp, &bat_priv->tvlv.container_list, list) {
if (tvlv_tmp->tvlv_hdr.type != type)
continue;
@@ -702,7 +736,7 @@ static struct batadv_tvlv_container
if (tvlv_tmp->tvlv_hdr.version != version)
continue;
- if (!atomic_inc_not_zero(&tvlv_tmp->refcount))
+ if (!kref_get_unless_zero(&tvlv_tmp->refcount))
continue;
tvlv = tvlv_tmp;
@@ -720,13 +754,15 @@ static struct batadv_tvlv_container
* Has to be called with the appropriate locks being acquired
* (tvlv.container_list_lock).
*
- * Returns size of all currently registered tvlv containers in bytes.
+ * Return: size of all currently registered tvlv containers in bytes.
*/
static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
{
struct batadv_tvlv_container *tvlv;
u16 tvlv_len = 0;
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
+
hlist_for_each_entry(tvlv, &bat_priv->tvlv.container_list, list) {
tvlv_len += sizeof(struct batadv_tvlv_hdr);
tvlv_len += ntohs(tvlv->tvlv_hdr.len);
@@ -747,7 +783,7 @@ static u16 batadv_tvlv_container_list_size(struct batadv_priv *bat_priv)
static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
struct batadv_tvlv_container *tvlv)
{
- lockdep_assert_held(&bat_priv->tvlv.handler_list_lock);
+ lockdep_assert_held(&bat_priv->tvlv.container_list_lock);
if (!tvlv)
return;
@@ -755,8 +791,8 @@ static void batadv_tvlv_container_remove(struct batadv_priv *bat_priv,
hlist_del(&tvlv->list);
/* first call to decrement the counter, second call to free */
- batadv_tvlv_container_free_ref(tvlv);
- batadv_tvlv_container_free_ref(tvlv);
+ batadv_tvlv_container_put(tvlv);
+ batadv_tvlv_container_put(tvlv);
}
/**
@@ -808,7 +844,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
memcpy(tvlv_new + 1, tvlv_value, ntohs(tvlv_new->tvlv_hdr.len));
INIT_HLIST_NODE(&tvlv_new->list);
- atomic_set(&tvlv_new->refcount, 1);
+ kref_init(&tvlv_new->refcount);
spin_lock_bh(&bat_priv->tvlv.container_list_lock);
tvlv_old = batadv_tvlv_container_get(bat_priv, type, version);
@@ -826,7 +862,7 @@ void batadv_tvlv_container_register(struct batadv_priv *bat_priv,
* @additional_packet_len: requested additional packet size on top of minimum
* size
*
- * Returns true of the packet buffer could be changed to the requested size,
+ * Return: true of the packet buffer could be changed to the requested size,
* false otherwise.
*/
static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
@@ -862,7 +898,7 @@ static bool batadv_tvlv_realloc_packet_buff(unsigned char **packet_buff,
* The ogm packet might be enlarged or shrunk depending on the current size
* and the size of the to-be-appended tvlv containers.
*
- * Returns size of all appended tvlv containers in bytes.
+ * Return: size of all appended tvlv containers in bytes.
*/
u16 batadv_tvlv_container_ogm_append(struct batadv_priv *bat_priv,
unsigned char **packet_buff,
@@ -908,14 +944,14 @@ end:
* appropriate handlers
* @bat_priv: the bat priv with all the soft interface information
* @tvlv_handler: tvlv callback function handling the tvlv content
- * @ogm_source: flag indicating wether the tvlv is an ogm or a unicast packet
+ * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
* @orig_node: orig node emitting the ogm packet
* @src: source mac address of the unicast packet
* @dst: destination mac address of the unicast packet
* @tvlv_value: tvlv content
* @tvlv_value_len: tvlv content length
*
- * Returns success if handler was not found or the return value of the handler
+ * Return: success if handler was not found or the return value of the handler
* callback.
*/
static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
@@ -961,14 +997,14 @@ static int batadv_tvlv_call_handler(struct batadv_priv *bat_priv,
* batadv_tvlv_containers_process - parse the given tvlv buffer to call the
* appropriate handlers
* @bat_priv: the bat priv with all the soft interface information
- * @ogm_source: flag indicating wether the tvlv is an ogm or a unicast packet
+ * @ogm_source: flag indicating whether the tvlv is an ogm or a unicast packet
* @orig_node: orig node emitting the ogm packet
* @src: source mac address of the unicast packet
* @dst: destination mac address of the unicast packet
* @tvlv_value: tvlv content
* @tvlv_value_len: tvlv content length
*
- * Returns success when processing an OGM or the return value of all called
+ * Return: success when processing an OGM or the return value of all called
* handler callbacks.
*/
int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
@@ -1001,7 +1037,7 @@ int batadv_tvlv_containers_process(struct batadv_priv *bat_priv,
src, dst, tvlv_value,
tvlv_value_cont_len);
if (tvlv_handler)
- batadv_tvlv_handler_free_ref(tvlv_handler);
+ batadv_tvlv_handler_put(tvlv_handler);
tvlv_value = (u8 *)tvlv_value + tvlv_value_cont_len;
tvlv_value_len -= tvlv_value_cont_len;
}
@@ -1081,7 +1117,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
tvlv_handler = batadv_tvlv_handler_get(bat_priv, type, version);
if (tvlv_handler) {
- batadv_tvlv_handler_free_ref(tvlv_handler);
+ batadv_tvlv_handler_put(tvlv_handler);
return;
}
@@ -1094,7 +1130,7 @@ void batadv_tvlv_handler_register(struct batadv_priv *bat_priv,
tvlv_handler->type = type;
tvlv_handler->version = version;
tvlv_handler->flags = flags;
- atomic_set(&tvlv_handler->refcount, 1);
+ kref_init(&tvlv_handler->refcount);
INIT_HLIST_NODE(&tvlv_handler->list);
spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
@@ -1118,11 +1154,11 @@ void batadv_tvlv_handler_unregister(struct batadv_priv *bat_priv,
if (!tvlv_handler)
return;
- batadv_tvlv_handler_free_ref(tvlv_handler);
+ batadv_tvlv_handler_put(tvlv_handler);
spin_lock_bh(&bat_priv->tvlv.handler_list_lock);
hlist_del_rcu(&tvlv_handler->list);
spin_unlock_bh(&bat_priv->tvlv.handler_list_lock);
- batadv_tvlv_handler_free_ref(tvlv_handler);
+ batadv_tvlv_handler_put(tvlv_handler);
}
/**
@@ -1143,15 +1179,14 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
struct batadv_unicast_tvlv_packet *unicast_tvlv_packet;
struct batadv_tvlv_hdr *tvlv_hdr;
struct batadv_orig_node *orig_node;
- struct sk_buff *skb = NULL;
+ struct sk_buff *skb;
unsigned char *tvlv_buff;
unsigned int tvlv_len;
ssize_t hdr_len = sizeof(*unicast_tvlv_packet);
- bool ret = false;
orig_node = batadv_orig_hash_find(bat_priv, dst);
if (!orig_node)
- goto out;
+ return;
tvlv_len = sizeof(*tvlv_hdr) + tvlv_value_len;
@@ -1180,14 +1215,10 @@ void batadv_tvlv_unicast_send(struct batadv_priv *bat_priv, u8 *src,
tvlv_buff += sizeof(*tvlv_hdr);
memcpy(tvlv_buff, tvlv_value, tvlv_value_len);
- if (batadv_send_skb_to_orig(skb, orig_node, NULL) != NET_XMIT_DROP)
- ret = true;
-
-out:
- if (skb && !ret)
+ if (batadv_send_skb_to_orig(skb, orig_node, NULL) == NET_XMIT_DROP)
kfree_skb(skb);
- if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+out:
+ batadv_orig_node_put(orig_node);
}
/**
@@ -1195,8 +1226,8 @@ out:
* @skb: the buffer containing the packet
* @header_len: length of the batman header preceding the ethernet header
*
- * If the packet embedded in the skb is vlan tagged this function returns the
- * VID with the BATADV_VLAN_HAS_TAG flag. Otherwise BATADV_NO_FLAGS is returned.
+ * Return: VID with the BATADV_VLAN_HAS_TAG flag when the packet embedded in the
+ * skb is vlan tagged. Otherwise BATADV_NO_FLAGS.
*/
unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
{
@@ -1223,7 +1254,7 @@ unsigned short batadv_get_vid(struct sk_buff *skb, size_t header_len)
* @vid: the VLAN identifier for which the AP isolation attributed as to be
* looked up
*
- * Returns true if AP isolation is on for the VLAN idenfied by vid, false
+ * Return: true if AP isolation is on for the VLAN idenfied by vid, false
* otherwise
*/
bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
@@ -1237,7 +1268,7 @@ bool batadv_vlan_ap_isola_get(struct batadv_priv *bat_priv, unsigned short vid)
vlan = batadv_softif_vlan_get(bat_priv, vid);
if (vlan) {
ap_isolation_enabled = atomic_read(&vlan->ap_isolation);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
}
return ap_isolation_enabled;
diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index ebd8af0a1eb0..db4533631834 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -24,17 +24,21 @@
#define BATADV_DRIVER_DEVICE "batman-adv"
#ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2015.2"
+#define BATADV_SOURCE_VERSION "2016.1"
#endif
/* B.A.T.M.A.N. parameters */
#define BATADV_TQ_MAX_VALUE 255
+#define BATADV_THROUGHPUT_MAX_VALUE 0xFFFFFFFF
#define BATADV_JITTER 20
/* Time To Live of broadcast messages */
#define BATADV_TTL 50
+/* maximum sequence number age of broadcast messages */
+#define BATADV_BCAST_MAX_AGE 64
+
/* purge originators after time in seconds if no valid packet comes in
* -> TODO: check influence on BATADV_TQ_LOCAL_WINDOW_SIZE
*/
@@ -57,6 +61,15 @@
#define BATADV_TQ_LOCAL_BIDRECT_RECV_MINIMUM 1
#define BATADV_TQ_TOTAL_BIDRECT_LIMIT 1
+/* B.A.T.M.A.N. V */
+#define BATADV_THROUGHPUT_DEFAULT_VALUE 10 /* 1 Mbps */
+#define BATADV_ELP_PROBES_PER_NODE 2
+#define BATADV_ELP_MIN_PROBE_SIZE 200 /* bytes */
+#define BATADV_ELP_PROBE_MAX_TX_DIFF 100 /* milliseconds */
+#define BATADV_ELP_MAX_AGE 64
+#define BATADV_OGM_MAX_ORIGDIFF 5
+#define BATADV_OGM_MAX_AGE 64
+
/* number of OGMs sent with the last tt diff */
#define BATADV_TT_OGM_APPEND_MAX 3
@@ -97,11 +110,6 @@
*/
#define BATADV_TQ_SIMILARITY_THRESHOLD 50
-/* how much worse secondary interfaces may be to be considered as bonding
- * candidates
- */
-#define BATADV_BONDING_TQ_THRESHOLD 50
-
/* should not be bigger than 512 bytes or change the size of
* forw_packet->direct_link_flags
*/
@@ -109,7 +117,7 @@
#define BATADV_MAX_AGGREGATION_MS 100
#define BATADV_BLA_PERIOD_LENGTH 10000 /* 10 seconds */
-#define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 3)
+#define BATADV_BLA_BACKBONE_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 6)
#define BATADV_BLA_CLAIM_TIMEOUT (BATADV_BLA_PERIOD_LENGTH * 10)
#define BATADV_BLA_WAIT_PERIODS 3
@@ -273,9 +281,14 @@ static inline void _batadv_dbg(int type __always_unused,
pr_err("%s: " fmt, _netdev->name, ## arg); \
} while (0)
-/* returns 1 if they are the same ethernet addr
+/**
+ * batadv_compare_eth - Compare two not u16 aligned Ethernet addresses
+ * @data1: Pointer to a six-byte array containing the Ethernet address
+ * @data2: Pointer other six-byte array containing the Ethernet address
*
* note: can't use ether_addr_equal() as it requires aligned memory
+ *
+ * Return: 1 if they are the same ethernet addr
*/
static inline bool batadv_compare_eth(const void *data1, const void *data2)
{
@@ -287,7 +300,7 @@ static inline bool batadv_compare_eth(const void *data1, const void *data2)
* @timestamp: base value to compare with (in jiffies)
* @timeout: added to base value before comparing (in milliseconds)
*
- * Returns true if current time is after timestamp + timeout
+ * Return: true if current time is after timestamp + timeout
*/
static inline bool batadv_has_timed_out(unsigned long timestamp,
unsigned int timeout)
@@ -326,7 +339,13 @@ static inline void batadv_add_counter(struct batadv_priv *bat_priv, size_t idx,
#define batadv_inc_counter(b, i) batadv_add_counter(b, i, 1)
-/* Sum and return the cpu-local counters for index 'idx' */
+/**
+ * batadv_sum_counter - Sum the cpu-local counters for index 'idx'
+ * @bat_priv: the bat priv with all the soft interface information
+ * @idx: index of counter to sum up
+ *
+ * Return: sum of all cpu-local counters
+ */
static inline u64 batadv_sum_counter(struct batadv_priv *bat_priv, size_t idx)
{
u64 *counters, sum = 0;
diff --git a/net/batman-adv/multicast.c b/net/batman-adv/multicast.c
index eb76386f8d4b..8caa2c72efa3 100644
--- a/net/batman-adv/multicast.c
+++ b/net/batman-adv/multicast.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors:
*
* Linus Lüssing
*
@@ -30,6 +30,7 @@
#include <linux/in.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
@@ -55,7 +56,7 @@
* Collect multicast addresses of the local multicast listeners
* on the given soft interface, dev, in the given mcast_list.
*
- * Returns -ENOMEM on memory allocation error or the number of
+ * Return: -ENOMEM on memory allocation error or the number of
* items added to the mcast_list otherwise.
*/
static int batadv_mcast_mla_softif_get(struct net_device *dev,
@@ -87,7 +88,7 @@ static int batadv_mcast_mla_softif_get(struct net_device *dev,
* @mcast_addr: the multicast address to check
* @mcast_list: the list with multicast addresses to search in
*
- * Returns true if the given address is already in the given list.
+ * Return: true if the given address is already in the given list.
* Otherwise returns false.
*/
static bool batadv_mcast_mla_is_duplicate(u8 *mcast_addr,
@@ -195,8 +196,9 @@ static void batadv_mcast_mla_tt_add(struct batadv_priv *bat_priv,
* batadv_mcast_has_bridge - check whether the soft-iface is bridged
* @bat_priv: the bat priv with all the soft interface information
*
- * Checks whether there is a bridge on top of our soft interface. Returns
- * true if so, false otherwise.
+ * Checks whether there is a bridge on top of our soft interface.
+ *
+ * Return: true if there is a bridge, false otherwise.
*/
static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
{
@@ -218,7 +220,7 @@ static bool batadv_mcast_has_bridge(struct batadv_priv *bat_priv)
* Updates the own multicast tvlv with our current multicast related settings,
* capabilities and inabilities.
*
- * Returns true if the tvlv container is registered afterwards. Otherwise
+ * Return: true if the tvlv container is registered afterwards. Otherwise
* returns false.
*/
static bool batadv_mcast_mla_tvlv_update(struct batadv_priv *bat_priv)
@@ -289,8 +291,8 @@ out:
* Checks whether the given IPv4 packet has the potential to be forwarded with a
* mode more optimal than classic flooding.
*
- * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM in case of
- * memory allocation failure.
+ * Return: If so then 0. Otherwise -EINVAL or -ENOMEM in case of memory
+ * allocation failure.
*/
static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
struct sk_buff *skb,
@@ -327,8 +329,7 @@ static int batadv_mcast_forw_mode_check_ipv4(struct batadv_priv *bat_priv,
* Checks whether the given IPv6 packet has the potential to be forwarded with a
* mode more optimal than classic flooding.
*
- * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out
- * of memory.
+ * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory
*/
static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
struct sk_buff *skb,
@@ -366,8 +367,7 @@ static int batadv_mcast_forw_mode_check_ipv6(struct batadv_priv *bat_priv,
* Checks whether the given multicast ethernet frame has the potential to be
* forwarded with a mode more optimal than classic flooding.
*
- * If so then returns 0. Otherwise -EINVAL is returned or -ENOMEM if we are out
- * of memory.
+ * Return: If so then 0. Otherwise -EINVAL is or -ENOMEM if we are out of memory
*/
static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
struct sk_buff *skb,
@@ -398,7 +398,7 @@ static int batadv_mcast_forw_mode_check(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @ethhdr: ethernet header of a packet
*
- * Returns the number of nodes which want all IPv4 multicast traffic if the
+ * Return: the number of nodes which want all IPv4 multicast traffic if the
* given ethhdr is from an IPv4 packet or the number of nodes which want all
* IPv6 traffic if it matches an IPv6 packet.
*/
@@ -421,7 +421,7 @@ static int batadv_mcast_forw_want_all_ip_count(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @ethhdr: the ether header containing the multicast destination
*
- * Returns an orig_node matching the multicast address provided by ethhdr
+ * Return: an orig_node matching the multicast address provided by ethhdr
* via a translation table lookup. This increases the returned nodes refcount.
*/
static struct batadv_orig_node *
@@ -436,7 +436,7 @@ batadv_mcast_forw_tt_node_get(struct batadv_priv *bat_priv,
* batadv_mcast_want_forw_ipv4_node_get - get a node with an ipv4 flag
* @bat_priv: the bat priv with all the soft interface information
*
- * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and
+ * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 flag set and
* increases its refcount.
*/
static struct batadv_orig_node *
@@ -448,7 +448,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv)
hlist_for_each_entry_rcu(tmp_orig_node,
&bat_priv->mcast.want_all_ipv4_list,
mcast_want_all_ipv4_node) {
- if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+ if (!kref_get_unless_zero(&tmp_orig_node->refcount))
continue;
orig_node = tmp_orig_node;
@@ -463,7 +463,7 @@ batadv_mcast_forw_ipv4_node_get(struct batadv_priv *bat_priv)
* batadv_mcast_want_forw_ipv6_node_get - get a node with an ipv6 flag
* @bat_priv: the bat priv with all the soft interface information
*
- * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set
+ * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV6 flag set
* and increases its refcount.
*/
static struct batadv_orig_node *
@@ -475,7 +475,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv)
hlist_for_each_entry_rcu(tmp_orig_node,
&bat_priv->mcast.want_all_ipv6_list,
mcast_want_all_ipv6_node) {
- if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+ if (!kref_get_unless_zero(&tmp_orig_node->refcount))
continue;
orig_node = tmp_orig_node;
@@ -491,7 +491,7 @@ batadv_mcast_forw_ipv6_node_get(struct batadv_priv *bat_priv)
* @bat_priv: the bat priv with all the soft interface information
* @ethhdr: an ethernet header to determine the protocol family from
*
- * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or
+ * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_IPV4 or
* BATADV_MCAST_WANT_ALL_IPV6 flag, depending on the provided ethhdr, set and
* increases its refcount.
*/
@@ -514,7 +514,7 @@ batadv_mcast_forw_ip_node_get(struct batadv_priv *bat_priv,
* batadv_mcast_want_forw_unsnoop_node_get - get a node with an unsnoopable flag
* @bat_priv: the bat priv with all the soft interface information
*
- * Returns an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag
+ * Return: an orig_node which has the BATADV_MCAST_WANT_ALL_UNSNOOPABLES flag
* set and increases its refcount.
*/
static struct batadv_orig_node *
@@ -526,7 +526,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv)
hlist_for_each_entry_rcu(tmp_orig_node,
&bat_priv->mcast.want_all_unsnoopables_list,
mcast_want_all_unsnoopables_node) {
- if (!atomic_inc_not_zero(&tmp_orig_node->refcount))
+ if (!kref_get_unless_zero(&tmp_orig_node->refcount))
continue;
orig_node = tmp_orig_node;
@@ -543,7 +543,7 @@ batadv_mcast_forw_unsnoop_node_get(struct batadv_priv *bat_priv)
* @skb: The multicast packet to check
* @orig: an originator to be set to forward the skb to
*
- * Returns the forwarding mode as enum batadv_forw_mode and in case of
+ * Return: the forwarding mode as enum batadv_forw_mode and in case of
* BATADV_FORW_SINGLE set the orig to the single originator the skb
* should be forwarded to.
*/
@@ -802,7 +802,9 @@ void batadv_mcast_free(struct batadv_priv *bat_priv)
batadv_tvlv_container_unregister(bat_priv, BATADV_TVLV_MCAST, 1);
batadv_tvlv_handler_unregister(bat_priv, BATADV_TVLV_MCAST, 1);
+ spin_lock_bh(&bat_priv->tt.commit_lock);
batadv_mcast_mla_tt_retract(bat_priv, NULL);
+ spin_unlock_bh(&bat_priv->tt.commit_lock);
}
/**
diff --git a/net/batman-adv/multicast.h b/net/batman-adv/multicast.h
index 8f3cb04b9f13..80bceec55592 100644
--- a/net/batman-adv/multicast.h
+++ b/net/batman-adv/multicast.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2014-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2014-2016 B.A.T.M.A.N. contributors:
*
* Linus Lüssing
*
@@ -23,7 +23,7 @@
struct sk_buff;
/**
- * batadv_forw_mode - the way a packet should be forwarded as
+ * enum batadv_forw_mode - the way a packet should be forwarded as
* @BATADV_FORW_ALL: forward the packet to all nodes (currently via classic
* flooding)
* @BATADV_FORW_SINGLE: forward the packet to a single node (currently via the
diff --git a/net/batman-adv/network-coding.c b/net/batman-adv/network-coding.c
index f5276be2c77c..b41719b6487a 100644
--- a/net/batman-adv/network-coding.c
+++ b/net/batman-adv/network-coding.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll, Jeppe Ledet-Pedersen
*
@@ -32,6 +32,7 @@
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
@@ -64,6 +65,8 @@ static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
/**
* batadv_nc_init - one-time initialization for network coding
+ *
+ * Return: 0 on success or negative error number in case of failure
*/
int __init batadv_nc_init(void)
{
@@ -142,6 +145,8 @@ static void batadv_nc_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
/**
* batadv_nc_mesh_init - initialise coding hash table and start house keeping
* @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
*/
int batadv_nc_mesh_init(struct batadv_priv *bat_priv)
{
@@ -203,39 +208,52 @@ void batadv_nc_init_orig(struct batadv_orig_node *orig_node)
}
/**
- * batadv_nc_node_free_rcu - rcu callback to free an nc node and remove
- * its refcount on the orig_node
- * @rcu: rcu pointer of the nc node
+ * batadv_nc_node_release - release nc_node from lists and queue for free after
+ * rcu grace period
+ * @ref: kref pointer of the nc_node
*/
-static void batadv_nc_node_free_rcu(struct rcu_head *rcu)
+static void batadv_nc_node_release(struct kref *ref)
{
struct batadv_nc_node *nc_node;
- nc_node = container_of(rcu, struct batadv_nc_node, rcu);
- batadv_orig_node_free_ref(nc_node->orig_node);
- kfree(nc_node);
+ nc_node = container_of(ref, struct batadv_nc_node, refcount);
+
+ batadv_orig_node_put(nc_node->orig_node);
+ kfree_rcu(nc_node, rcu);
}
/**
- * batadv_nc_node_free_ref - decrements the nc node refcounter and possibly
- * frees it
- * @nc_node: the nc node to free
+ * batadv_nc_node_put - decrement the nc_node refcounter and possibly
+ * release it
+ * @nc_node: nc_node to be free'd
*/
-static void batadv_nc_node_free_ref(struct batadv_nc_node *nc_node)
+static void batadv_nc_node_put(struct batadv_nc_node *nc_node)
{
- if (atomic_dec_and_test(&nc_node->refcount))
- call_rcu(&nc_node->rcu, batadv_nc_node_free_rcu);
+ kref_put(&nc_node->refcount, batadv_nc_node_release);
}
/**
- * batadv_nc_path_free_ref - decrements the nc path refcounter and possibly
- * frees it
- * @nc_path: the nc node to free
+ * batadv_nc_path_release - release nc_path from lists and queue for free after
+ * rcu grace period
+ * @ref: kref pointer of the nc_path
*/
-static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path)
+static void batadv_nc_path_release(struct kref *ref)
{
- if (atomic_dec_and_test(&nc_path->refcount))
- kfree_rcu(nc_path, rcu);
+ struct batadv_nc_path *nc_path;
+
+ nc_path = container_of(ref, struct batadv_nc_path, refcount);
+
+ kfree_rcu(nc_path, rcu);
+}
+
+/**
+ * batadv_nc_path_put - decrement the nc_path refcounter and possibly
+ * release it
+ * @nc_path: nc_path to be free'd
+ */
+static void batadv_nc_path_put(struct batadv_nc_path *nc_path)
+{
+ kref_put(&nc_path->refcount, batadv_nc_path_release);
}
/**
@@ -244,10 +262,8 @@ static void batadv_nc_path_free_ref(struct batadv_nc_path *nc_path)
*/
static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet)
{
- if (nc_packet->skb)
- kfree_skb(nc_packet->skb);
-
- batadv_nc_path_free_ref(nc_packet->nc_path);
+ kfree_skb(nc_packet->skb);
+ batadv_nc_path_put(nc_packet->nc_path);
kfree(nc_packet);
}
@@ -256,7 +272,7 @@ static void batadv_nc_packet_free(struct batadv_nc_packet *nc_packet)
* @bat_priv: the bat priv with all the soft interface information
* @nc_node: the nc node to check
*
- * Returns true if the entry has to be purged now, false otherwise
+ * Return: true if the entry has to be purged now, false otherwise
*/
static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
struct batadv_nc_node *nc_node)
@@ -272,7 +288,7 @@ static bool batadv_nc_to_purge_nc_node(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @nc_path: the nc path to check
*
- * Returns true if the entry has to be purged now, false otherwise
+ * Return: true if the entry has to be purged now, false otherwise
*/
static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
struct batadv_nc_path *nc_path)
@@ -292,7 +308,7 @@ static bool batadv_nc_to_purge_nc_path_coding(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @nc_path: the nc path to check
*
- * Returns true if the entry has to be purged now, false otherwise
+ * Return: true if the entry has to be purged now, false otherwise
*/
static bool batadv_nc_to_purge_nc_path_decoding(struct batadv_priv *bat_priv,
struct batadv_nc_path *nc_path)
@@ -340,7 +356,7 @@ batadv_nc_purge_orig_nc_nodes(struct batadv_priv *bat_priv,
"Removing nc_node %pM -> %pM\n",
nc_node->addr, nc_node->orig_node->orig);
list_del_rcu(&nc_node->list);
- batadv_nc_node_free_ref(nc_node);
+ batadv_nc_node_put(nc_node);
}
spin_unlock_bh(lock);
}
@@ -451,7 +467,7 @@ static void batadv_nc_purge_paths(struct batadv_priv *bat_priv,
"Remove nc_path %pM -> %pM\n",
nc_path->prev_hop, nc_path->next_hop);
hlist_del_rcu(&nc_path->hash_entry);
- batadv_nc_path_free_ref(nc_path);
+ batadv_nc_path_put(nc_path);
}
spin_unlock_bh(lock);
}
@@ -475,7 +491,7 @@ static void batadv_nc_hash_key_gen(struct batadv_nc_path *key, const char *src,
* @data: data to hash
* @size: size of the hash table
*
- * Returns the selected index in the hash table for the given data.
+ * Return: the selected index in the hash table for the given data.
*/
static u32 batadv_nc_hash_choose(const void *data, u32 size)
{
@@ -494,7 +510,7 @@ static u32 batadv_nc_hash_choose(const void *data, u32 size)
* @node: node in the local table
* @data2: second object to compare the node to
*
- * Returns 1 if the two entry are the same, 0 otherwise
+ * Return: 1 if the two entry are the same, 0 otherwise
*/
static int batadv_nc_hash_compare(const struct hlist_node *node,
const void *data2)
@@ -521,7 +537,7 @@ static int batadv_nc_hash_compare(const struct hlist_node *node,
* @hash: hash table containing the nc path
* @data: search key
*
- * Returns the nc_path if found, NULL otherwise.
+ * Return: the nc_path if found, NULL otherwise.
*/
static struct batadv_nc_path *
batadv_nc_hash_find(struct batadv_hashtable *hash,
@@ -542,7 +558,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash,
if (!batadv_nc_hash_compare(&nc_path->hash_entry, data))
continue;
- if (!atomic_inc_not_zero(&nc_path->refcount))
+ if (!kref_get_unless_zero(&nc_path->refcount))
continue;
nc_path_tmp = nc_path;
@@ -559,9 +575,7 @@ batadv_nc_hash_find(struct batadv_hashtable *hash,
*/
static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
{
- batadv_send_skb_packet(nc_packet->skb,
- nc_packet->neigh_node->if_incoming,
- nc_packet->nc_path->next_hop);
+ batadv_send_unicast_skb(nc_packet->skb, nc_packet->neigh_node);
nc_packet->skb = NULL;
batadv_nc_packet_free(nc_packet);
}
@@ -576,7 +590,7 @@ static void batadv_nc_send_packet(struct batadv_nc_packet *nc_packet)
* timeout. If so, the packet is no longer kept and the entry deleted from the
* queue. Has to be called with the appropriate locks.
*
- * Returns false as soon as the entry in the fifo queue has not been timed out
+ * Return: false as soon as the entry in the fifo queue has not been timed out
* yet and true otherwise.
*/
static bool batadv_nc_sniffed_purge(struct batadv_priv *bat_priv,
@@ -615,7 +629,7 @@ out:
* packet is no longer delayed, immediately sent and the entry deleted from the
* queue. Has to be called with the appropriate locks.
*
- * Returns false as soon as the entry in the fifo queue has not been timed out
+ * Return: false as soon as the entry in the fifo queue has not been timed out
* yet and true otherwise.
*/
static bool batadv_nc_fwd_flush(struct batadv_priv *bat_priv,
@@ -736,7 +750,7 @@ static void batadv_nc_worker(struct work_struct *work)
* @orig_node: neighboring orig node which may be used as nc candidate
* @ogm_packet: incoming ogm packet also used for the checks
*
- * Returns true if:
+ * Return: true if:
* 1) The OGM must have the most recent sequence number.
* 2) The TTL must be decremented by one and only one.
* 3) The OGM must be received from the first hop from orig_node.
@@ -756,7 +770,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
last_ttl = orig_ifinfo->last_ttl;
last_real_seqno = orig_ifinfo->last_real_seqno;
- batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ batadv_orig_ifinfo_put(orig_ifinfo);
if (last_real_seqno != ntohl(ogm_packet->seqno))
return false;
@@ -777,7 +791,7 @@ static bool batadv_can_nc_with_orig(struct batadv_priv *bat_priv,
* (can be equal to orig_node)
* @in_coding: traverse incoming or outgoing network coding list
*
- * Returns the nc_node if found, NULL otherwise.
+ * Return: the nc_node if found, NULL otherwise.
*/
static struct batadv_nc_node
*batadv_nc_find_nc_node(struct batadv_orig_node *orig_node,
@@ -798,7 +812,7 @@ static struct batadv_nc_node
if (!batadv_compare_eth(nc_node->addr, orig_node->orig))
continue;
- if (!atomic_inc_not_zero(&nc_node->refcount))
+ if (!kref_get_unless_zero(&nc_node->refcount))
continue;
/* Found a match */
@@ -819,7 +833,7 @@ static struct batadv_nc_node
* (can be equal to orig_node)
* @in_coding: traverse incoming or outgoing network coding list
*
- * Returns the nc_node if found or created, NULL in case of an error.
+ * Return: the nc_node if found or created, NULL in case of an error.
*/
static struct batadv_nc_node
*batadv_nc_get_nc_node(struct batadv_priv *bat_priv,
@@ -842,14 +856,15 @@ static struct batadv_nc_node
if (!nc_node)
return NULL;
- if (!atomic_inc_not_zero(&orig_neigh_node->refcount))
+ if (!kref_get_unless_zero(&orig_neigh_node->refcount))
goto free;
/* Initialize nc_node */
INIT_LIST_HEAD(&nc_node->list);
ether_addr_copy(nc_node->addr, orig_node->orig);
nc_node->orig_node = orig_neigh_node;
- atomic_set(&nc_node->refcount, 2);
+ kref_init(&nc_node->refcount);
+ kref_get(&nc_node->refcount);
/* Select ingoing or outgoing coding node */
if (in_coding) {
@@ -925,9 +940,9 @@ void batadv_nc_update_nc_node(struct batadv_priv *bat_priv,
out:
if (in_nc_node)
- batadv_nc_node_free_ref(in_nc_node);
+ batadv_nc_node_put(in_nc_node);
if (out_nc_node)
- batadv_nc_node_free_ref(out_nc_node);
+ batadv_nc_node_put(out_nc_node);
}
/**
@@ -937,7 +952,7 @@ out:
* @src: ethernet source address - first half of the nc path search key
* @dst: ethernet destination address - second half of the nc path search key
*
- * Returns pointer to nc_path if the path was found or created, returns NULL
+ * Return: pointer to nc_path if the path was found or created, returns NULL
* on error.
*/
static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
@@ -968,7 +983,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
/* Initialize nc_path */
INIT_LIST_HEAD(&nc_path->packet_list);
spin_lock_init(&nc_path->packet_list_lock);
- atomic_set(&nc_path->refcount, 2);
+ kref_init(&nc_path->refcount);
+ kref_get(&nc_path->refcount);
nc_path->last_valid = jiffies;
ether_addr_copy(nc_path->next_hop, dst);
ether_addr_copy(nc_path->prev_hop, src);
@@ -994,6 +1010,8 @@ static struct batadv_nc_path *batadv_nc_get_path(struct batadv_priv *bat_priv,
* batadv_nc_random_weight_tq - scale the receivers TQ-value to avoid unfair
* selection of a receiver with slightly lower TQ than the other
* @tq: to be weighted tq value
+ *
+ * Return: scaled tq value
*/
static u8 batadv_nc_random_weight_tq(u8 tq)
{
@@ -1034,7 +1052,7 @@ static void batadv_nc_memxor(char *dst, const char *src, unsigned int len)
* @nc_packet: structure containing the packet to the skb can be coded with
* @neigh_node: next hop to forward packet to
*
- * Returns true if both packets are consumed, false otherwise.
+ * Return: true if both packets are consumed, false otherwise.
*/
static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
struct sk_buff *skb,
@@ -1047,11 +1065,11 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
struct batadv_unicast_packet *packet1;
struct batadv_unicast_packet *packet2;
struct batadv_coded_packet *coded_packet;
- struct batadv_neigh_node *neigh_tmp, *router_neigh;
- struct batadv_neigh_node *router_coding = NULL;
+ struct batadv_neigh_node *neigh_tmp, *router_neigh, *first_dest;
+ struct batadv_neigh_node *router_coding = NULL, *second_dest;
struct batadv_neigh_ifinfo *router_neigh_ifinfo = NULL;
struct batadv_neigh_ifinfo *router_coding_ifinfo = NULL;
- u8 *first_source, *first_dest, *second_source, *second_dest;
+ u8 *first_source, *second_source;
__be32 packet_id1, packet_id2;
size_t count;
bool res = false;
@@ -1094,9 +1112,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
*/
if (tq_weighted_neigh >= tq_weighted_coding) {
/* Destination from nc_packet is selected for MAC-header */
- first_dest = nc_packet->nc_path->next_hop;
+ first_dest = nc_packet->neigh_node;
first_source = nc_packet->nc_path->prev_hop;
- second_dest = neigh_node->addr;
+ second_dest = neigh_node;
second_source = ethhdr->h_source;
packet1 = (struct batadv_unicast_packet *)nc_packet->skb->data;
packet2 = (struct batadv_unicast_packet *)skb->data;
@@ -1105,9 +1123,9 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
skb->data + sizeof(*packet2));
} else {
/* Destination for skb is selected for MAC-header */
- first_dest = neigh_node->addr;
+ first_dest = neigh_node;
first_source = ethhdr->h_source;
- second_dest = nc_packet->nc_path->next_hop;
+ second_dest = nc_packet->neigh_node;
second_source = nc_packet->nc_path->prev_hop;
packet1 = (struct batadv_unicast_packet *)skb->data;
packet2 = (struct batadv_unicast_packet *)nc_packet->skb->data;
@@ -1149,7 +1167,7 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
coded_packet->first_ttvn = packet1->ttvn;
/* Info about second unicast packet */
- ether_addr_copy(coded_packet->second_dest, second_dest);
+ ether_addr_copy(coded_packet->second_dest, second_dest->addr);
ether_addr_copy(coded_packet->second_source, second_source);
ether_addr_copy(coded_packet->second_orig_dest, packet2->dest);
coded_packet->second_crc = packet_id2;
@@ -1204,17 +1222,17 @@ static bool batadv_nc_code_packets(struct batadv_priv *bat_priv,
batadv_nc_packet_free(nc_packet);
/* Send the coded packet and return true */
- batadv_send_skb_packet(skb_dest, neigh_node->if_incoming, first_dest);
+ batadv_send_unicast_skb(skb_dest, first_dest);
res = true;
out:
if (router_neigh)
- batadv_neigh_node_free_ref(router_neigh);
+ batadv_neigh_node_put(router_neigh);
if (router_coding)
- batadv_neigh_node_free_ref(router_coding);
+ batadv_neigh_node_put(router_coding);
if (router_neigh_ifinfo)
- batadv_neigh_ifinfo_free_ref(router_neigh_ifinfo);
+ batadv_neigh_ifinfo_put(router_neigh_ifinfo);
if (router_coding_ifinfo)
- batadv_neigh_ifinfo_free_ref(router_coding_ifinfo);
+ batadv_neigh_ifinfo_put(router_coding_ifinfo);
return res;
}
@@ -1233,7 +1251,7 @@ out:
* Since the source encoded the packet we can be certain it has all necessary
* decode information.
*
- * Returns true if coding of a decoded packet is allowed.
+ * Return: true if coding of a decoded packet is allowed.
*/
static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src)
{
@@ -1251,7 +1269,7 @@ static bool batadv_nc_skb_coding_possible(struct sk_buff *skb, u8 *dst, u8 *src)
* @skb: data skb to forward
* @eth_dst: next hop mac address of skb
*
- * Returns true if coding of a decoded skb is allowed.
+ * Return: true if coding of a decoded skb is allowed.
*/
static struct batadv_nc_packet *
batadv_nc_path_search(struct batadv_priv *bat_priv,
@@ -1319,7 +1337,7 @@ batadv_nc_path_search(struct batadv_priv *bat_priv,
* @eth_src: source mac address of skb
* @in_nc_node: pointer to skb next hop's neighbor nc node
*
- * Returns an nc packet if a suitable coding packet was found, NULL otherwise.
+ * Return: an nc packet if a suitable coding packet was found, NULL otherwise.
*/
static struct batadv_nc_packet *
batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
@@ -1352,7 +1370,7 @@ batadv_nc_skb_src_search(struct batadv_priv *bat_priv,
}
rcu_read_unlock();
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return nc_packet;
}
@@ -1402,7 +1420,7 @@ static void batadv_nc_skb_store_before_coding(struct batadv_priv *bat_priv,
* next hop that potentially sent a packet which our next hop also received
* (overheard) and has stored for later decoding.
*
- * Returns true if the skb was consumed (encoded packet sent) or false otherwise
+ * Return: true if the skb was consumed (encoded packet sent) or false otherwise
*/
static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
struct batadv_neigh_node *neigh_node,
@@ -1456,7 +1474,7 @@ static bool batadv_nc_skb_dst_search(struct sk_buff *skb,
* @neigh_node: next hop to forward packet to
* @packet_id: checksum to identify packet
*
- * Returns true if the packet was buffered or false in case of an error.
+ * Return: true if the packet was buffered or false in case of an error.
*/
static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
struct batadv_nc_path *nc_path,
@@ -1490,7 +1508,7 @@ static bool batadv_nc_skb_add_to_path(struct sk_buff *skb,
* @skb: data skb to forward
* @neigh_node: next hop to forward packet to
*
- * Returns true if the skb was consumed (encoded packet sent) or false otherwise
+ * Return: true if the skb was consumed (encoded packet sent) or false otherwise
*/
bool batadv_nc_skb_forward(struct sk_buff *skb,
struct batadv_neigh_node *neigh_node)
@@ -1535,7 +1553,7 @@ bool batadv_nc_skb_forward(struct sk_buff *skb,
return true;
free_nc_path:
- batadv_nc_path_free_ref(nc_path);
+ batadv_nc_path_put(nc_path);
out:
/* Packet is not consumed */
return false;
@@ -1597,7 +1615,7 @@ void batadv_nc_skb_store_for_decoding(struct batadv_priv *bat_priv,
free_skb:
kfree_skb(skb);
free_nc_path:
- batadv_nc_path_free_ref(nc_path);
+ batadv_nc_path_put(nc_path);
out:
return;
}
@@ -1629,7 +1647,7 @@ void batadv_nc_skb_store_sniffed_unicast(struct batadv_priv *bat_priv,
* @skb: unicast skb to decode
* @nc_packet: decode data needed to decode the skb
*
- * Returns pointer to decoded unicast packet if the packet was decoded or NULL
+ * Return: pointer to decoded unicast packet if the packet was decoded or NULL
* in case of an error.
*/
static struct batadv_unicast_packet *
@@ -1723,7 +1741,7 @@ batadv_nc_skb_decode_packet(struct batadv_priv *bat_priv, struct sk_buff *skb,
* @ethhdr: pointer to the ethernet header inside the coded packet
* @coded: coded packet we try to find decode data for
*
- * Returns pointer to nc packet if the needed data was found or NULL otherwise.
+ * Return: pointer to nc packet if the needed data was found or NULL otherwise.
*/
static struct batadv_nc_packet *
batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
@@ -1786,6 +1804,9 @@ batadv_nc_find_decoding_packet(struct batadv_priv *bat_priv,
* resulting unicast packet
* @skb: incoming coded packet
* @recv_if: pointer to interface this packet was received on
+ *
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * otherwise.
*/
static int batadv_nc_recv_coded_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
@@ -1870,6 +1891,8 @@ void batadv_nc_mesh_free(struct batadv_priv *bat_priv)
* batadv_nc_nodes_seq_print_text - print the nc node information
* @seq: seq file to print on
* @offset: not used
+ *
+ * Return: always 0
*/
int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset)
{
@@ -1925,13 +1948,15 @@ int batadv_nc_nodes_seq_print_text(struct seq_file *seq, void *offset)
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return 0;
}
/**
* batadv_nc_init_debugfs - create nc folder and related files in debugfs
* @bat_priv: the bat priv with all the soft interface information
+ *
+ * Return: 0 on success or negative error number in case of failure
*/
int batadv_nc_init_debugfs(struct batadv_priv *bat_priv)
{
diff --git a/net/batman-adv/network-coding.h b/net/batman-adv/network-coding.h
index 8f6d4ad8778a..d6d7fb4ec5d5 100644
--- a/net/batman-adv/network-coding.h
+++ b/net/batman-adv/network-coding.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2012-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2012-2016 B.A.T.M.A.N. contributors:
*
* Martin Hundebøll, Jeppe Ledet-Pedersen
*
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 7486df9ed48d..e4cbb0753e37 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2009-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2009-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -18,11 +18,13 @@
#include "originator.h"
#include "main.h"
+#include <linux/atomic.h>
#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/fs.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
@@ -47,7 +49,13 @@ static struct lock_class_key batadv_orig_hash_lock_class_key;
static void batadv_purge_orig(struct work_struct *work);
-/* returns 1 if they are the same originator */
+/**
+ * batadv_compare_orig - comparing function used in the originator hash table
+ * @node: node in the local table
+ * @data2: second object to compare the node to
+ *
+ * Return: 1 if they are the same originator
+ */
int batadv_compare_orig(const struct hlist_node *node, const void *data2)
{
const void *data1 = container_of(node, struct batadv_orig_node,
@@ -61,7 +69,7 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2)
* @orig_node: the originator serving the VLAN
* @vid: the VLAN identifier
*
- * Returns the vlan object identified by vid and belonging to orig_node or NULL
+ * Return: the vlan object identified by vid and belonging to orig_node or NULL
* if it does not exist.
*/
struct batadv_orig_node_vlan *
@@ -75,7 +83,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
if (tmp->vid != vid)
continue;
- if (!atomic_inc_not_zero(&tmp->refcount))
+ if (!kref_get_unless_zero(&tmp->refcount))
continue;
vlan = tmp;
@@ -93,7 +101,7 @@ batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
* @orig_node: the originator serving the VLAN
* @vid: the VLAN identifier
*
- * Returns NULL in case of failure or the vlan object identified by vid and
+ * Return: NULL in case of failure or the vlan object identified by vid and
* belonging to orig_node otherwise. The object is created and added to the list
* if it does not exist.
*
@@ -116,7 +124,8 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
if (!vlan)
goto out;
- atomic_set(&vlan->refcount, 2);
+ kref_init(&vlan->refcount);
+ kref_get(&vlan->refcount);
vlan->vid = vid;
hlist_add_head_rcu(&vlan->list, &orig_node->vlan_list);
@@ -128,14 +137,27 @@ out:
}
/**
- * batadv_orig_node_vlan_free_ref - decrement the refcounter and possibly free
+ * batadv_orig_node_vlan_release - release originator-vlan object from lists
+ * and queue for free after rcu grace period
+ * @ref: kref pointer of the originator-vlan object
+ */
+static void batadv_orig_node_vlan_release(struct kref *ref)
+{
+ struct batadv_orig_node_vlan *orig_vlan;
+
+ orig_vlan = container_of(ref, struct batadv_orig_node_vlan, refcount);
+
+ kfree_rcu(orig_vlan, rcu);
+}
+
+/**
+ * batadv_orig_node_vlan_put - decrement the refcounter and possibly release
* the originator-vlan object
* @orig_vlan: the originator-vlan object to release
*/
-void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan)
+void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan)
{
- if (atomic_dec_and_test(&orig_vlan->refcount))
- kfree_rcu(orig_vlan, rcu);
+ kref_put(&orig_vlan->refcount, batadv_orig_node_vlan_release);
}
int batadv_originator_init(struct batadv_priv *bat_priv)
@@ -163,92 +185,107 @@ err:
}
/**
- * batadv_neigh_ifinfo_free_rcu - free the neigh_ifinfo object
- * @rcu: rcu pointer of the neigh_ifinfo object
+ * batadv_neigh_ifinfo_release - release neigh_ifinfo from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the neigh_ifinfo
*/
-static void batadv_neigh_ifinfo_free_rcu(struct rcu_head *rcu)
+static void batadv_neigh_ifinfo_release(struct kref *ref)
{
struct batadv_neigh_ifinfo *neigh_ifinfo;
- neigh_ifinfo = container_of(rcu, struct batadv_neigh_ifinfo, rcu);
+ neigh_ifinfo = container_of(ref, struct batadv_neigh_ifinfo, refcount);
if (neigh_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
- batadv_hardif_free_ref_now(neigh_ifinfo->if_outgoing);
+ batadv_hardif_put(neigh_ifinfo->if_outgoing);
- kfree(neigh_ifinfo);
+ kfree_rcu(neigh_ifinfo, rcu);
}
/**
- * batadv_neigh_ifinfo_free_now - decrement the refcounter and possibly free
- * the neigh_ifinfo (without rcu callback)
+ * batadv_neigh_ifinfo_put - decrement the refcounter and possibly release
+ * the neigh_ifinfo
* @neigh_ifinfo: the neigh_ifinfo object to release
*/
-static void
-batadv_neigh_ifinfo_free_ref_now(struct batadv_neigh_ifinfo *neigh_ifinfo)
+void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo)
{
- if (atomic_dec_and_test(&neigh_ifinfo->refcount))
- batadv_neigh_ifinfo_free_rcu(&neigh_ifinfo->rcu);
+ kref_put(&neigh_ifinfo->refcount, batadv_neigh_ifinfo_release);
}
/**
- * batadv_neigh_ifinfo_free_ref - decrement the refcounter and possibly free
- * the neigh_ifinfo
- * @neigh_ifinfo: the neigh_ifinfo object to release
+ * batadv_hardif_neigh_release - release hardif neigh node from lists and
+ * queue for free after rcu grace period
+ * @ref: kref pointer of the neigh_node
+ */
+static void batadv_hardif_neigh_release(struct kref *ref)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh;
+
+ hardif_neigh = container_of(ref, struct batadv_hardif_neigh_node,
+ refcount);
+
+ spin_lock_bh(&hardif_neigh->if_incoming->neigh_list_lock);
+ hlist_del_init_rcu(&hardif_neigh->list);
+ spin_unlock_bh(&hardif_neigh->if_incoming->neigh_list_lock);
+
+ batadv_hardif_put(hardif_neigh->if_incoming);
+ kfree_rcu(hardif_neigh, rcu);
+}
+
+/**
+ * batadv_hardif_neigh_put - decrement the hardif neighbors refcounter
+ * and possibly release it
+ * @hardif_neigh: hardif neigh neighbor to free
*/
-void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo)
+void batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh)
{
- if (atomic_dec_and_test(&neigh_ifinfo->refcount))
- call_rcu(&neigh_ifinfo->rcu, batadv_neigh_ifinfo_free_rcu);
+ kref_put(&hardif_neigh->refcount, batadv_hardif_neigh_release);
}
/**
- * batadv_neigh_node_free_rcu - free the neigh_node
- * @rcu: rcu pointer of the neigh_node
+ * batadv_neigh_node_release - release neigh_node from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the neigh_node
*/
-static void batadv_neigh_node_free_rcu(struct rcu_head *rcu)
+static void batadv_neigh_node_release(struct kref *ref)
{
struct hlist_node *node_tmp;
struct batadv_neigh_node *neigh_node;
+ struct batadv_hardif_neigh_node *hardif_neigh;
struct batadv_neigh_ifinfo *neigh_ifinfo;
struct batadv_algo_ops *bao;
- neigh_node = container_of(rcu, struct batadv_neigh_node, rcu);
+ neigh_node = container_of(ref, struct batadv_neigh_node, refcount);
bao = neigh_node->orig_node->bat_priv->bat_algo_ops;
hlist_for_each_entry_safe(neigh_ifinfo, node_tmp,
&neigh_node->ifinfo_list, list) {
- batadv_neigh_ifinfo_free_ref_now(neigh_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
+ }
+
+ hardif_neigh = batadv_hardif_neigh_get(neigh_node->if_incoming,
+ neigh_node->addr);
+ if (hardif_neigh) {
+ /* batadv_hardif_neigh_get() increases refcount too */
+ batadv_hardif_neigh_put(hardif_neigh);
+ batadv_hardif_neigh_put(hardif_neigh);
}
if (bao->bat_neigh_free)
bao->bat_neigh_free(neigh_node);
- batadv_hardif_free_ref_now(neigh_node->if_incoming);
+ batadv_hardif_put(neigh_node->if_incoming);
- kfree(neigh_node);
+ kfree_rcu(neigh_node, rcu);
}
/**
- * batadv_neigh_node_free_ref_now - decrement the neighbors refcounter
- * and possibly free it (without rcu callback)
+ * batadv_neigh_node_put - decrement the neighbors refcounter and possibly
+ * release it
* @neigh_node: neigh neighbor to free
*/
-static void
-batadv_neigh_node_free_ref_now(struct batadv_neigh_node *neigh_node)
+void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node)
{
- if (atomic_dec_and_test(&neigh_node->refcount))
- batadv_neigh_node_free_rcu(&neigh_node->rcu);
-}
-
-/**
- * batadv_neigh_node_free_ref - decrement the neighbors refcounter
- * and possibly free it
- * @neigh_node: neigh neighbor to free
- */
-void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node)
-{
- if (atomic_dec_and_test(&neigh_node->refcount))
- call_rcu(&neigh_node->rcu, batadv_neigh_node_free_rcu);
+ kref_put(&neigh_node->refcount, batadv_neigh_node_release);
}
/**
@@ -257,7 +294,7 @@ void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node)
* @if_outgoing: the interface where the payload packet has been received or
* the OGM should be sent to
*
- * Returns the neighbor which should be router for this orig_node/iface.
+ * Return: the neighbor which should be router for this orig_node/iface.
*
* The object is returned with refcounter increased by 1.
*/
@@ -277,7 +314,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node,
break;
}
- if (router && !atomic_inc_not_zero(&router->refcount))
+ if (router && !kref_get_unless_zero(&router->refcount))
router = NULL;
rcu_read_unlock();
@@ -289,7 +326,7 @@ batadv_orig_router_get(struct batadv_orig_node *orig_node,
* @orig_node: the orig node to be queried
* @if_outgoing: the interface for which the ifinfo should be acquired
*
- * Returns the requested orig_ifinfo or NULL if not found.
+ * Return: the requested orig_ifinfo or NULL if not found.
*
* The object is returned with refcounter increased by 1.
*/
@@ -305,7 +342,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
if (tmp->if_outgoing != if_outgoing)
continue;
- if (!atomic_inc_not_zero(&tmp->refcount))
+ if (!kref_get_unless_zero(&tmp->refcount))
continue;
orig_ifinfo = tmp;
@@ -321,7 +358,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
* @orig_node: the orig node to be queried
* @if_outgoing: the interface for which the ifinfo should be acquired
*
- * Returns NULL in case of failure or the orig_ifinfo object for the if_outgoing
+ * Return: NULL in case of failure or the orig_ifinfo object for the if_outgoing
* interface otherwise. The object is created and added to the list
* if it does not exist.
*
@@ -345,7 +382,7 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
goto out;
if (if_outgoing != BATADV_IF_DEFAULT &&
- !atomic_inc_not_zero(&if_outgoing->refcount)) {
+ !kref_get_unless_zero(&if_outgoing->refcount)) {
kfree(orig_ifinfo);
orig_ifinfo = NULL;
goto out;
@@ -356,7 +393,8 @@ batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
orig_ifinfo->batman_seqno_reset = reset_time;
orig_ifinfo->if_outgoing = if_outgoing;
INIT_HLIST_NODE(&orig_ifinfo->list);
- atomic_set(&orig_ifinfo->refcount, 2);
+ kref_init(&orig_ifinfo->refcount);
+ kref_get(&orig_ifinfo->refcount);
hlist_add_head_rcu(&orig_ifinfo->list,
&orig_node->ifinfo_list);
out:
@@ -366,12 +404,12 @@ out:
/**
* batadv_neigh_ifinfo_get - find the ifinfo from an neigh_node
- * @neigh_node: the neigh node to be queried
+ * @neigh: the neigh node to be queried
* @if_outgoing: the interface for which the ifinfo should be acquired
*
* The object is returned with refcounter increased by 1.
*
- * Returns the requested neigh_ifinfo or NULL if not found
+ * Return: the requested neigh_ifinfo or NULL if not found
*/
struct batadv_neigh_ifinfo *
batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
@@ -386,7 +424,7 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
if (tmp_neigh_ifinfo->if_outgoing != if_outgoing)
continue;
- if (!atomic_inc_not_zero(&tmp_neigh_ifinfo->refcount))
+ if (!kref_get_unless_zero(&tmp_neigh_ifinfo->refcount))
continue;
neigh_ifinfo = tmp_neigh_ifinfo;
@@ -399,10 +437,10 @@ batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
/**
* batadv_neigh_ifinfo_new - search and possibly create an neigh_ifinfo object
- * @neigh_node: the neigh node to be queried
+ * @neigh: the neigh node to be queried
* @if_outgoing: the interface for which the ifinfo should be acquired
*
- * Returns NULL in case of failure or the neigh_ifinfo object for the
+ * Return: NULL in case of failure or the neigh_ifinfo object for the
* if_outgoing interface otherwise. The object is created and added to the list
* if it does not exist.
*
@@ -424,14 +462,15 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
if (!neigh_ifinfo)
goto out;
- if (if_outgoing && !atomic_inc_not_zero(&if_outgoing->refcount)) {
+ if (if_outgoing && !kref_get_unless_zero(&if_outgoing->refcount)) {
kfree(neigh_ifinfo);
neigh_ifinfo = NULL;
goto out;
}
INIT_HLIST_NODE(&neigh_ifinfo->list);
- atomic_set(&neigh_ifinfo->refcount, 2);
+ kref_init(&neigh_ifinfo->refcount);
+ kref_get(&neigh_ifinfo->refcount);
neigh_ifinfo->if_outgoing = if_outgoing;
hlist_add_head_rcu(&neigh_ifinfo->list, &neigh->ifinfo_list);
@@ -450,7 +489,8 @@ out:
*
* Looks for and possibly returns a neighbour belonging to this originator list
* which is connected through the provided hard interface.
- * Returns NULL if the neighbour is not found.
+ *
+ * Return: neighbor when found. Othwerwise NULL
*/
static struct batadv_neigh_node *
batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
@@ -467,7 +507,7 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
if (tmp_neigh_node->if_incoming != hard_iface)
continue;
- if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+ if (!kref_get_unless_zero(&tmp_neigh_node->refcount))
continue;
res = tmp_neigh_node;
@@ -479,13 +519,115 @@ batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
}
/**
+ * batadv_hardif_neigh_create - create a hardif neighbour node
+ * @hard_iface: the interface this neighbour is connected to
+ * @neigh_addr: the interface address of the neighbour to retrieve
+ *
+ * Return: the hardif neighbour node if found or created or NULL otherwise.
+ */
+static struct batadv_hardif_neigh_node *
+batadv_hardif_neigh_create(struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
+{
+ struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+
+ spin_lock_bh(&hard_iface->neigh_list_lock);
+
+ /* check if neighbor hasn't been added in the meantime */
+ hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr);
+ if (hardif_neigh)
+ goto out;
+
+ if (!kref_get_unless_zero(&hard_iface->refcount))
+ goto out;
+
+ hardif_neigh = kzalloc(sizeof(*hardif_neigh), GFP_ATOMIC);
+ if (!hardif_neigh) {
+ batadv_hardif_put(hard_iface);
+ goto out;
+ }
+
+ INIT_HLIST_NODE(&hardif_neigh->list);
+ ether_addr_copy(hardif_neigh->addr, neigh_addr);
+ hardif_neigh->if_incoming = hard_iface;
+ hardif_neigh->last_seen = jiffies;
+
+ kref_init(&hardif_neigh->refcount);
+
+ if (bat_priv->bat_algo_ops->bat_hardif_neigh_init)
+ bat_priv->bat_algo_ops->bat_hardif_neigh_init(hardif_neigh);
+
+ hlist_add_head(&hardif_neigh->list, &hard_iface->neigh_list);
+
+out:
+ spin_unlock_bh(&hard_iface->neigh_list_lock);
+ return hardif_neigh;
+}
+
+/**
+ * batadv_hardif_neigh_get_or_create - retrieve or create a hardif neighbour
+ * node
+ * @hard_iface: the interface this neighbour is connected to
+ * @neigh_addr: the interface address of the neighbour to retrieve
+ *
+ * Return: the hardif neighbour node if found or created or NULL otherwise.
+ */
+static struct batadv_hardif_neigh_node *
+batadv_hardif_neigh_get_or_create(struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
+{
+ struct batadv_hardif_neigh_node *hardif_neigh = NULL;
+
+ /* first check without locking to avoid the overhead */
+ hardif_neigh = batadv_hardif_neigh_get(hard_iface, neigh_addr);
+ if (hardif_neigh)
+ return hardif_neigh;
+
+ return batadv_hardif_neigh_create(hard_iface, neigh_addr);
+}
+
+/**
+ * batadv_hardif_neigh_get - retrieve a hardif neighbour from the list
+ * @hard_iface: the interface where this neighbour is connected to
+ * @neigh_addr: the address of the neighbour
+ *
+ * Looks for and possibly returns a neighbour belonging to this hard interface.
+ *
+ * Return: neighbor when found. Othwerwise NULL
+ */
+struct batadv_hardif_neigh_node *
+batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
+{
+ struct batadv_hardif_neigh_node *tmp_hardif_neigh, *hardif_neigh = NULL;
+
+ rcu_read_lock();
+ hlist_for_each_entry_rcu(tmp_hardif_neigh,
+ &hard_iface->neigh_list, list) {
+ if (!batadv_compare_eth(tmp_hardif_neigh->addr, neigh_addr))
+ continue;
+
+ if (!kref_get_unless_zero(&tmp_hardif_neigh->refcount))
+ continue;
+
+ hardif_neigh = tmp_hardif_neigh;
+ break;
+ }
+ rcu_read_unlock();
+
+ return hardif_neigh;
+}
+
+/**
* batadv_neigh_node_new - create and init a new neigh_node object
* @orig_node: originator object representing the neighbour
* @hard_iface: the interface where the neighbour is connected to
* @neigh_addr: the mac address of the neighbour interface
*
* Allocates a new neigh_node object and initialises all the generic fields.
- * Returns the new object or NULL on failure.
+ *
+ * Return: neighbor when found. Othwerwise NULL
*/
struct batadv_neigh_node *
batadv_neigh_node_new(struct batadv_orig_node *orig_node,
@@ -493,16 +635,22 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
const u8 *neigh_addr)
{
struct batadv_neigh_node *neigh_node;
+ struct batadv_hardif_neigh_node *hardif_neigh = NULL;
neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
if (neigh_node)
goto out;
+ hardif_neigh = batadv_hardif_neigh_get_or_create(hard_iface,
+ neigh_addr);
+ if (!hardif_neigh)
+ goto out;
+
neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC);
if (!neigh_node)
goto out;
- if (!atomic_inc_not_zero(&hard_iface->refcount)) {
+ if (!kref_get_unless_zero(&hard_iface->refcount)) {
kfree(neigh_node);
neigh_node = NULL;
goto out;
@@ -517,72 +665,127 @@ batadv_neigh_node_new(struct batadv_orig_node *orig_node,
neigh_node->orig_node = orig_node;
/* extra reference for return */
- atomic_set(&neigh_node->refcount, 2);
+ kref_init(&neigh_node->refcount);
+ kref_get(&neigh_node->refcount);
spin_lock_bh(&orig_node->neigh_list_lock);
hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
spin_unlock_bh(&orig_node->neigh_list_lock);
+ /* increment unique neighbor refcount */
+ kref_get(&hardif_neigh->refcount);
+
batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
"Creating new neighbor %pM for orig_node %pM on interface %s\n",
neigh_addr, orig_node->orig, hard_iface->net_dev->name);
out:
+ if (hardif_neigh)
+ batadv_hardif_neigh_put(hardif_neigh);
return neigh_node;
}
/**
- * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object
- * @rcu: rcu pointer of the orig_ifinfo object
+ * batadv_hardif_neigh_seq_print_text - print the single hop neighbour list
+ * @seq: neighbour table seq_file struct
+ * @offset: not used
+ *
+ * Return: always 0
*/
-static void batadv_orig_ifinfo_free_rcu(struct rcu_head *rcu)
+int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset)
+{
+ struct net_device *net_dev = (struct net_device *)seq->private;
+ struct batadv_priv *bat_priv = netdev_priv(net_dev);
+ struct batadv_hard_iface *primary_if;
+
+ primary_if = batadv_seq_print_text_primary_if_get(seq);
+ if (!primary_if)
+ return 0;
+
+ seq_printf(seq, "[B.A.T.M.A.N. adv %s, MainIF/MAC: %s/%pM (%s %s)]\n",
+ BATADV_SOURCE_VERSION, primary_if->net_dev->name,
+ primary_if->net_dev->dev_addr, net_dev->name,
+ bat_priv->bat_algo_ops->name);
+
+ batadv_hardif_put(primary_if);
+
+ if (!bat_priv->bat_algo_ops->bat_neigh_print) {
+ seq_puts(seq,
+ "No printing function for this routing protocol\n");
+ return 0;
+ }
+
+ bat_priv->bat_algo_ops->bat_neigh_print(bat_priv, seq);
+ return 0;
+}
+
+/**
+ * batadv_orig_ifinfo_release - release orig_ifinfo from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the orig_ifinfo
+ */
+static void batadv_orig_ifinfo_release(struct kref *ref)
{
struct batadv_orig_ifinfo *orig_ifinfo;
struct batadv_neigh_node *router;
- orig_ifinfo = container_of(rcu, struct batadv_orig_ifinfo, rcu);
+ orig_ifinfo = container_of(ref, struct batadv_orig_ifinfo, refcount);
if (orig_ifinfo->if_outgoing != BATADV_IF_DEFAULT)
- batadv_hardif_free_ref_now(orig_ifinfo->if_outgoing);
+ batadv_hardif_put(orig_ifinfo->if_outgoing);
/* this is the last reference to this object */
router = rcu_dereference_protected(orig_ifinfo->router, true);
if (router)
- batadv_neigh_node_free_ref_now(router);
- kfree(orig_ifinfo);
+ batadv_neigh_node_put(router);
+
+ kfree_rcu(orig_ifinfo, rcu);
}
/**
- * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free
- * the orig_ifinfo (without rcu callback)
+ * batadv_orig_ifinfo_put - decrement the refcounter and possibly release
+ * the orig_ifinfo
* @orig_ifinfo: the orig_ifinfo object to release
*/
-static void
-batadv_orig_ifinfo_free_ref_now(struct batadv_orig_ifinfo *orig_ifinfo)
+void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo)
{
- if (atomic_dec_and_test(&orig_ifinfo->refcount))
- batadv_orig_ifinfo_free_rcu(&orig_ifinfo->rcu);
+ kref_put(&orig_ifinfo->refcount, batadv_orig_ifinfo_release);
}
/**
- * batadv_orig_ifinfo_free_ref - decrement the refcounter and possibly free
- * the orig_ifinfo
- * @orig_ifinfo: the orig_ifinfo object to release
+ * batadv_orig_node_free_rcu - free the orig_node
+ * @rcu: rcu pointer of the orig_node
*/
-void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo)
+static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
{
- if (atomic_dec_and_test(&orig_ifinfo->refcount))
- call_rcu(&orig_ifinfo->rcu, batadv_orig_ifinfo_free_rcu);
+ struct batadv_orig_node *orig_node;
+
+ orig_node = container_of(rcu, struct batadv_orig_node, rcu);
+
+ batadv_mcast_purge_orig(orig_node);
+
+ batadv_frag_purge_orig(orig_node, NULL);
+
+ if (orig_node->bat_priv->bat_algo_ops->bat_orig_free)
+ orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node);
+
+ kfree(orig_node->tt_buff);
+ kfree(orig_node);
}
-static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
+/**
+ * batadv_orig_node_release - release orig_node from lists and queue for
+ * free after rcu grace period
+ * @ref: kref pointer of the orig_node
+ */
+static void batadv_orig_node_release(struct kref *ref)
{
struct hlist_node *node_tmp;
struct batadv_neigh_node *neigh_node;
struct batadv_orig_node *orig_node;
struct batadv_orig_ifinfo *orig_ifinfo;
- orig_node = container_of(rcu, struct batadv_orig_node, rcu);
+ orig_node = container_of(ref, struct batadv_orig_node, refcount);
spin_lock_bh(&orig_node->neigh_list_lock);
@@ -590,50 +793,30 @@ static void batadv_orig_node_free_rcu(struct rcu_head *rcu)
hlist_for_each_entry_safe(neigh_node, node_tmp,
&orig_node->neigh_list, list) {
hlist_del_rcu(&neigh_node->list);
- batadv_neigh_node_free_ref_now(neigh_node);
+ batadv_neigh_node_put(neigh_node);
}
hlist_for_each_entry_safe(orig_ifinfo, node_tmp,
&orig_node->ifinfo_list, list) {
hlist_del_rcu(&orig_ifinfo->list);
- batadv_orig_ifinfo_free_ref_now(orig_ifinfo);
+ batadv_orig_ifinfo_put(orig_ifinfo);
}
spin_unlock_bh(&orig_node->neigh_list_lock);
- batadv_mcast_purge_orig(orig_node);
-
/* Free nc_nodes */
batadv_nc_purge_orig(orig_node->bat_priv, orig_node, NULL);
- batadv_frag_purge_orig(orig_node, NULL);
-
- if (orig_node->bat_priv->bat_algo_ops->bat_orig_free)
- orig_node->bat_priv->bat_algo_ops->bat_orig_free(orig_node);
-
- kfree(orig_node->tt_buff);
- kfree(orig_node);
+ call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
}
/**
- * batadv_orig_node_free_ref - decrement the orig node refcounter and possibly
- * schedule an rcu callback for freeing it
+ * batadv_orig_node_put - decrement the orig node refcounter and possibly
+ * release it
* @orig_node: the orig node to free
*/
-void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node)
+void batadv_orig_node_put(struct batadv_orig_node *orig_node)
{
- if (atomic_dec_and_test(&orig_node->refcount))
- call_rcu(&orig_node->rcu, batadv_orig_node_free_rcu);
-}
-
-/**
- * batadv_orig_node_free_ref_now - decrement the orig node refcounter and
- * possibly free it (without rcu callback)
- * @orig_node: the orig node to free
- */
-void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node)
-{
- if (atomic_dec_and_test(&orig_node->refcount))
- batadv_orig_node_free_rcu(&orig_node->rcu);
+ kref_put(&orig_node->refcount, batadv_orig_node_release);
}
void batadv_originator_free(struct batadv_priv *bat_priv)
@@ -660,7 +843,7 @@ void batadv_originator_free(struct batadv_priv *bat_priv)
hlist_for_each_entry_safe(orig_node, node_tmp,
head, hash_entry) {
hlist_del_rcu(&orig_node->hash_entry);
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
}
spin_unlock_bh(list_lock);
}
@@ -675,7 +858,8 @@ void batadv_originator_free(struct batadv_priv *bat_priv)
*
* Creates a new originator object and initialise all the generic fields.
* The new object is not added to the originator list.
- * Returns the newly created object or NULL on failure.
+ *
+ * Return: the newly created object or NULL on failure.
*/
struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
const u8 *addr)
@@ -704,7 +888,8 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
batadv_nc_init_orig(orig_node);
/* extra reference for return */
- atomic_set(&orig_node->refcount, 2);
+ kref_init(&orig_node->refcount);
+ kref_get(&orig_node->refcount);
orig_node->bat_priv = bat_priv;
ether_addr_copy(orig_node->orig, addr);
@@ -732,7 +917,7 @@ struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
* Immediately release vlan since it is not needed anymore in this
* context
*/
- batadv_orig_node_vlan_free_ref(vlan);
+ batadv_orig_node_vlan_put(vlan);
for (i = 0; i < BATADV_FRAG_BUFFER_COUNT; i++) {
INIT_HLIST_HEAD(&orig_node->fragments[i].head);
@@ -781,7 +966,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
neigh->addr, if_outgoing->net_dev->name);
hlist_del_rcu(&neigh_ifinfo->list);
- batadv_neigh_ifinfo_free_ref(neigh_ifinfo);
+ batadv_neigh_ifinfo_put(neigh_ifinfo);
}
spin_unlock_bh(&neigh->ifinfo_lock);
@@ -792,7 +977,7 @@ batadv_purge_neigh_ifinfo(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @orig_node: orig node which is to be checked
*
- * Returns true if any ifinfo entry was purged, false otherwise.
+ * Return: true if any ifinfo entry was purged, false otherwise.
*/
static bool
batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
@@ -827,10 +1012,10 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
ifinfo_purged = true;
hlist_del_rcu(&orig_ifinfo->list);
- batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ batadv_orig_ifinfo_put(orig_ifinfo);
if (orig_node->last_bonding_candidate == orig_ifinfo) {
orig_node->last_bonding_candidate = NULL;
- batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ batadv_orig_ifinfo_put(orig_ifinfo);
}
}
@@ -844,7 +1029,7 @@ batadv_purge_orig_ifinfo(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @orig_node: orig node which is to be checked
*
- * Returns true if any neighbor was purged, false otherwise
+ * Return: true if any neighbor was purged, false otherwise
*/
static bool
batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
@@ -884,7 +1069,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
neigh_purged = true;
hlist_del_rcu(&neigh_node->list);
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
} else {
/* only necessary if not the whole neighbor is to be
* deleted, but some interface has been removed.
@@ -903,7 +1088,7 @@ batadv_purge_orig_neighbors(struct batadv_priv *bat_priv,
* @orig_node: orig node which is to be checked
* @if_outgoing: the interface for which the metric should be compared
*
- * Returns the current best neighbor, with refcount increased.
+ * Return: the current best neighbor, with refcount increased.
*/
static struct batadv_neigh_node *
batadv_find_best_neighbor(struct batadv_priv *bat_priv,
@@ -919,11 +1104,11 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
best, if_outgoing) <= 0))
continue;
- if (!atomic_inc_not_zero(&neigh->refcount))
+ if (!kref_get_unless_zero(&neigh->refcount))
continue;
if (best)
- batadv_neigh_node_free_ref(best);
+ batadv_neigh_node_put(best);
best = neigh;
}
@@ -940,7 +1125,7 @@ batadv_find_best_neighbor(struct batadv_priv *bat_priv,
* This function checks if the orig_node or substructures of it have become
* obsolete, and purges this information if that's the case.
*
- * Returns true if the orig_node is to be removed, false otherwise.
+ * Return: true if the orig_node is to be removed, false otherwise.
*/
static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node)
@@ -969,7 +1154,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
batadv_update_route(bat_priv, orig_node, BATADV_IF_DEFAULT,
best_neigh_node);
if (best_neigh_node)
- batadv_neigh_node_free_ref(best_neigh_node);
+ batadv_neigh_node_put(best_neigh_node);
/* ... then for all other interfaces. */
rcu_read_lock();
@@ -986,7 +1171,7 @@ static bool batadv_purge_orig_node(struct batadv_priv *bat_priv,
batadv_update_route(bat_priv, orig_node, hard_iface,
best_neigh_node);
if (best_neigh_node)
- batadv_neigh_node_free_ref(best_neigh_node);
+ batadv_neigh_node_put(best_neigh_node);
}
rcu_read_unlock();
@@ -1019,7 +1204,7 @@ static void _batadv_purge_orig(struct batadv_priv *bat_priv)
batadv_tt_global_del_orig(orig_node->bat_priv,
orig_node, -1,
"originator timed out");
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
continue;
}
@@ -1065,7 +1250,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
primary_if->net_dev->dev_addr, net_dev->name,
bat_priv->bat_algo_ops->name);
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
if (!bat_priv->bat_algo_ops->bat_orig_print) {
seq_puts(seq,
@@ -1085,7 +1270,7 @@ int batadv_orig_seq_print_text(struct seq_file *seq, void *offset)
* @seq: debugfs table seq_file struct
* @offset: not used
*
- * Returns 0
+ * Return: 0
*/
int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
{
@@ -1121,7 +1306,7 @@ int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset)
out:
if (hard_iface)
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
return 0;
}
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index fa18f9bf266b..4e8b67f11051 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -20,10 +20,10 @@
#include "main.h"
-#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/if_ether.h>
#include <linux/jhash.h>
+#include <linux/kref.h>
#include <linux/rculist.h>
#include <linux/rcupdate.h>
#include <linux/stddef.h>
@@ -37,15 +37,19 @@ int batadv_compare_orig(const struct hlist_node *node, const void *data2);
int batadv_originator_init(struct batadv_priv *bat_priv);
void batadv_originator_free(struct batadv_priv *bat_priv);
void batadv_purge_orig_ref(struct batadv_priv *bat_priv);
-void batadv_orig_node_free_ref(struct batadv_orig_node *orig_node);
-void batadv_orig_node_free_ref_now(struct batadv_orig_node *orig_node);
+void batadv_orig_node_put(struct batadv_orig_node *orig_node);
struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
const u8 *addr);
+struct batadv_hardif_neigh_node *
+batadv_hardif_neigh_get(const struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr);
+void
+batadv_hardif_neigh_put(struct batadv_hardif_neigh_node *hardif_neigh);
struct batadv_neigh_node *
batadv_neigh_node_new(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *hard_iface,
const u8 *neigh_addr);
-void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node);
+void batadv_neigh_node_put(struct batadv_neigh_node *neigh_node);
struct batadv_neigh_node *
batadv_orig_router_get(struct batadv_orig_node *orig_node,
const struct batadv_hard_iface *if_outgoing);
@@ -55,7 +59,9 @@ batadv_neigh_ifinfo_new(struct batadv_neigh_node *neigh,
struct batadv_neigh_ifinfo *
batadv_neigh_ifinfo_get(struct batadv_neigh_node *neigh,
struct batadv_hard_iface *if_outgoing);
-void batadv_neigh_ifinfo_free_ref(struct batadv_neigh_ifinfo *neigh_ifinfo);
+void batadv_neigh_ifinfo_put(struct batadv_neigh_ifinfo *neigh_ifinfo);
+
+int batadv_hardif_neigh_seq_print_text(struct seq_file *seq, void *offset);
struct batadv_orig_ifinfo *
batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
@@ -63,7 +69,7 @@ batadv_orig_ifinfo_get(struct batadv_orig_node *orig_node,
struct batadv_orig_ifinfo *
batadv_orig_ifinfo_new(struct batadv_orig_node *orig_node,
struct batadv_hard_iface *if_outgoing);
-void batadv_orig_ifinfo_free_ref(struct batadv_orig_ifinfo *orig_ifinfo);
+void batadv_orig_ifinfo_put(struct batadv_orig_ifinfo *orig_ifinfo);
int batadv_orig_seq_print_text(struct seq_file *seq, void *offset);
int batadv_orig_hardif_seq_print_text(struct seq_file *seq, void *offset);
@@ -77,7 +83,7 @@ batadv_orig_node_vlan_new(struct batadv_orig_node *orig_node,
struct batadv_orig_node_vlan *
batadv_orig_node_vlan_get(struct batadv_orig_node *orig_node,
unsigned short vid);
-void batadv_orig_node_vlan_free_ref(struct batadv_orig_node_vlan *orig_vlan);
+void batadv_orig_node_vlan_put(struct batadv_orig_node_vlan *orig_vlan);
/* hashfunction to choose an entry in a hash table of given size
* hash algorithm from http://en.wikipedia.org/wiki/Hash_table
@@ -109,7 +115,7 @@ batadv_orig_hash_find(struct batadv_priv *bat_priv, const void *data)
if (!batadv_compare_eth(orig_node, data))
continue;
- if (!atomic_inc_not_zero(&orig_node->refcount))
+ if (!kref_get_unless_zero(&orig_node->refcount))
continue;
orig_node_tmp = orig_node;
diff --git a/net/batman-adv/packet.h b/net/batman-adv/packet.h
index 11f996b39fef..8a8d7ca1a5cf 100644
--- a/net/batman-adv/packet.h
+++ b/net/batman-adv/packet.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -26,6 +26,8 @@
* @BATADV_IV_OGM: originator messages for B.A.T.M.A.N. IV
* @BATADV_BCAST: broadcast packets carrying broadcast payload
* @BATADV_CODED: network coded packets
+ * @BATADV_ELP: echo location packets for B.A.T.M.A.N. V
+ * @BATADV_OGM2: originator messages for B.A.T.M.A.N. V
*
* @BATADV_UNICAST: unicast packets carrying unicast payload traffic
* @BATADV_UNICAST_FRAG: unicast packets carrying a fragment of the original
@@ -40,6 +42,8 @@ enum batadv_packettype {
BATADV_IV_OGM = 0x00,
BATADV_BCAST = 0x01,
BATADV_CODED = 0x02,
+ BATADV_ELP = 0x03,
+ BATADV_OGM2 = 0x04,
/* 0x40 - 0x7f: unicast */
#define BATADV_UNICAST_MIN 0x40
BATADV_UNICAST = 0x40,
@@ -72,8 +76,7 @@ enum batadv_subtype {
* enum batadv_iv_flags - flags used in B.A.T.M.A.N. IV OGM packets
* @BATADV_NOT_BEST_NEXT_HOP: flag is set when ogm packet is forwarded and was
* previously received from someone else than the best neighbor.
- * @BATADV_PRIMARIES_FIRST_HOP: flag is set when the primary interface address
- * is used, and the packet travels its first hop.
+ * @BATADV_PRIMARIES_FIRST_HOP: flag unused.
* @BATADV_DIRECTLINK: flag is for the first hop or if rebroadcasted from a
* one hop neighbor on the interface where it was originally received.
*/
@@ -159,7 +162,7 @@ enum batadv_tt_client_flags {
};
/**
- * batadv_vlan_flags - flags for the four MSB of any vlan ID field
+ * enum batadv_vlan_flags - flags for the four MSB of any vlan ID field
* @BATADV_VLAN_HAS_TAG: whether the field contains a valid vlan tag or not
*/
enum batadv_vlan_flags {
@@ -210,6 +213,11 @@ struct batadv_bla_claim_dst {
* @version: batman-adv protocol version, part of the genereal header
* @ttl: time to live for this packet, part of the genereal header
* @flags: contains routing relevant flags - see enum batadv_iv_flags
+ * @seqno: sequence identification
+ * @orig: address of the source node
+ * @prev_sender: address of the previous sender
+ * @reserved: reserved byte for alignment
+ * @tq: transmission quality
* @tvlv_len: length of tvlv data following the ogm header
*/
struct batadv_ogm_packet {
@@ -231,7 +239,52 @@ struct batadv_ogm_packet {
#define BATADV_OGM_HLEN sizeof(struct batadv_ogm_packet)
/**
- * batadv_icmp_header - common members among all the ICMP packets
+ * struct batadv_ogm2_packet - ogm2 (routing protocol) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the general header
+ * @ttl: time to live for this packet, part of the general header
+ * @flags: reseved for routing relevant flags - currently always 0
+ * @seqno: sequence number
+ * @orig: originator mac address
+ * @tvlv_len: length of the appended tvlv buffer (in bytes)
+ * @throughput: the currently flooded path throughput
+ */
+struct batadv_ogm2_packet {
+ u8 packet_type;
+ u8 version;
+ u8 ttl;
+ u8 flags;
+ __be32 seqno;
+ u8 orig[ETH_ALEN];
+ __be16 tvlv_len;
+ __be32 throughput;
+ /* __packed is not needed as the struct size is divisible by 4,
+ * and the largest data type in this struct has a size of 4.
+ */
+};
+
+#define BATADV_OGM2_HLEN sizeof(struct batadv_ogm2_packet)
+
+/**
+ * struct batadv_elp_packet - elp (neighbor discovery) packet
+ * @packet_type: batman-adv packet type, part of the general header
+ * @version: batman-adv protocol version, part of the genereal header
+ * @orig: originator mac address
+ * @seqno: sequence number
+ * @elp_interval: currently used ELP sending interval in ms
+ */
+struct batadv_elp_packet {
+ u8 packet_type;
+ u8 version;
+ u8 orig[ETH_ALEN];
+ __be32 seqno;
+ __be32 elp_interval;
+};
+
+#define BATADV_ELP_HLEN sizeof(struct batadv_elp_packet)
+
+/**
+ * struct batadv_icmp_header - common members among all the ICMP packets
* @packet_type: batman-adv packet type, part of the general header
* @version: batman-adv protocol version, part of the genereal header
* @ttl: time to live for this packet, part of the genereal header
@@ -257,7 +310,7 @@ struct batadv_icmp_header {
};
/**
- * batadv_icmp_packet - ICMP packet
+ * struct batadv_icmp_packet - ICMP packet
* @packet_type: batman-adv packet type, part of the general header
* @version: batman-adv protocol version, part of the genereal header
* @ttl: time to live for this packet, part of the genereal header
@@ -283,7 +336,7 @@ struct batadv_icmp_packet {
#define BATADV_RR_LEN 16
/**
- * batadv_icmp_packet_rr - ICMP RouteRecord packet
+ * struct batadv_icmp_packet_rr - ICMP RouteRecord packet
* @packet_type: batman-adv packet type, part of the general header
* @version: batman-adv protocol version, part of the genereal header
* @ttl: time to live for this packet, part of the genereal header
@@ -346,6 +399,7 @@ struct batadv_unicast_packet {
* @u: common unicast packet header
* @src: address of the source
* @subtype: packet subtype
+ * @reserved: reserved byte for alignment
*/
struct batadv_unicast_4addr_packet {
struct batadv_unicast_packet u;
@@ -414,7 +468,6 @@ struct batadv_bcast_packet {
* @packet_type: batman-adv packet type, part of the general header
* @version: batman-adv protocol version, part of the genereal header
* @ttl: time to live for this packet, part of the genereal header
- * @reserved: Align following fields to 2-byte boundaries
* @first_source: original source of first included packet
* @first_orig_dest: original destinal of first included packet
* @first_crc: checksum of first included packet
@@ -496,7 +549,7 @@ struct batadv_tvlv_gateway_data {
* struct batadv_tvlv_tt_data - tt data propagated through the tt tvlv container
* @flags: translation table flags (see batadv_tt_data_flags)
* @ttvn: translation table version number
- * @vlan_num: number of announced VLANs. In the TVLV this struct is followed by
+ * @num_vlan: number of announced VLANs. In the TVLV this struct is followed by
* one batadv_tvlv_tt_vlan_data object per announced vlan
*/
struct batadv_tvlv_tt_data {
diff --git a/net/batman-adv/routing.c b/net/batman-adv/routing.c
index 8d990b070a2e..4dd646a52f1a 100644
--- a/net/batman-adv/routing.c
+++ b/net/batman-adv/routing.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -25,6 +25,7 @@
#include <linux/etherdevice.h>
#include <linux/if_ether.h>
#include <linux/jiffies.h>
+#include <linux/kref.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
#include <linux/rculist.h>
@@ -72,7 +73,7 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
rcu_read_lock();
curr_router = rcu_dereference(orig_ifinfo->router);
- if (curr_router && !atomic_inc_not_zero(&curr_router->refcount))
+ if (curr_router && !kref_get_unless_zero(&curr_router->refcount))
curr_router = NULL;
rcu_read_unlock();
@@ -97,20 +98,20 @@ static void _batadv_update_route(struct batadv_priv *bat_priv,
}
if (curr_router)
- batadv_neigh_node_free_ref(curr_router);
+ batadv_neigh_node_put(curr_router);
/* increase refcount of new best neighbor */
- if (neigh_node && !atomic_inc_not_zero(&neigh_node->refcount))
+ if (neigh_node && !kref_get_unless_zero(&neigh_node->refcount))
neigh_node = NULL;
spin_lock_bh(&orig_node->neigh_list_lock);
rcu_assign_pointer(orig_ifinfo->router, neigh_node);
spin_unlock_bh(&orig_node->neigh_list_lock);
- batadv_orig_ifinfo_free_ref(orig_ifinfo);
+ batadv_orig_ifinfo_put(orig_ifinfo);
/* decrease refcount of previous best neighbor */
if (curr_router)
- batadv_neigh_node_free_ref(curr_router);
+ batadv_neigh_node_put(curr_router);
}
/**
@@ -137,24 +138,38 @@ void batadv_update_route(struct batadv_priv *bat_priv,
out:
if (router)
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
}
-/* checks whether the host restarted and is in the protection time.
- * returns:
- * 0 if the packet is to be accepted
+/**
+ * batadv_window_protected - checks whether the host restarted and is in the
+ * protection time.
+ * @bat_priv: the bat priv with all the soft interface information
+ * @seq_num_diff: difference between the current/received sequence number and
+ * the last sequence number
+ * @seq_old_max_diff: maximum age of sequence number not considered as restart
+ * @last_reset: jiffies timestamp of the last reset, will be updated when reset
+ * is detected
+ * @protection_started: is set to true if the protection window was started,
+ * doesn't change otherwise.
+ *
+ * Return:
+ * 0 if the packet is to be accepted.
* 1 if the packet is to be ignored.
*/
int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
- unsigned long *last_reset)
+ s32 seq_old_max_diff, unsigned long *last_reset,
+ bool *protection_started)
{
- if (seq_num_diff <= -BATADV_TQ_LOCAL_WINDOW_SIZE ||
+ if (seq_num_diff <= -seq_old_max_diff ||
seq_num_diff >= BATADV_EXPECTED_SEQNO_RANGE) {
if (!batadv_has_timed_out(*last_reset,
BATADV_RESET_PROTECTION_MS))
return 1;
*last_reset = jiffies;
+ if (protection_started)
+ *protection_started = true;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
"old packet received, start protection\n");
}
@@ -198,7 +213,7 @@ bool batadv_check_management_packet(struct sk_buff *skb,
* @bat_priv: the bat priv with all the soft interface information
* @skb: icmp packet to process
*
- * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
* otherwise.
*/
static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
@@ -254,9 +269,9 @@ static int batadv_recv_my_icmp_packet(struct batadv_priv *bat_priv,
}
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return ret;
}
@@ -302,9 +317,9 @@ static int batadv_recv_icmp_ttl_exceeded(struct batadv_priv *bat_priv,
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return ret;
}
@@ -388,7 +403,7 @@ int batadv_recv_icmp_packet(struct sk_buff *skb,
out:
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return ret;
}
@@ -398,10 +413,11 @@ out:
* @skb: packet to check
* @hdr_size: size of header to pull
*
- * Check for short header and bad addresses in given packet. Returns negative
- * value when check fails and 0 otherwise. The negative value depends on the
- * reason: -ENODATA for bad header, -EBADR for broadcast destination or source,
- * and -EREMOTE for non-local (other host) destination.
+ * Check for short header and bad addresses in given packet.
+ *
+ * Return: negative value when check fails and 0 otherwise. The negative value
+ * depends on the reason: -ENODATA for bad header, -EBADR for broadcast
+ * destination or source, and -EREMOTE for non-local (other host) destination.
*/
static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
struct sk_buff *skb, int hdr_size)
@@ -435,7 +451,7 @@ static int batadv_check_unicast_packet(struct batadv_priv *bat_priv,
* @orig_node: the destination node
* @recv_if: pointer to interface this packet was received on
*
- * Returns the router which should be used for this orig_node on
+ * Return: the router which should be used for this orig_node on
* this interface, or NULL if not available.
*/
struct batadv_neigh_node *
@@ -482,14 +498,14 @@ batadv_find_router(struct batadv_priv *bat_priv,
hlist_for_each_entry_rcu(cand, &orig_node->ifinfo_list, list) {
/* acquire some structures and references ... */
- if (!atomic_inc_not_zero(&cand->refcount))
+ if (!kref_get_unless_zero(&cand->refcount))
continue;
cand_router = rcu_dereference(cand->router);
if (!cand_router)
goto next;
- if (!atomic_inc_not_zero(&cand_router->refcount)) {
+ if (!kref_get_unless_zero(&cand_router->refcount)) {
cand_router = NULL;
goto next;
}
@@ -497,9 +513,9 @@ batadv_find_router(struct batadv_priv *bat_priv,
/* alternative candidate should be good enough to be
* considered
*/
- if (!bao->bat_neigh_is_equiv_or_better(cand_router,
- cand->if_outgoing,
- router, recv_if))
+ if (!bao->bat_neigh_is_similar_or_better(cand_router,
+ cand->if_outgoing,
+ router, recv_if))
goto next;
/* don't use the same router twice */
@@ -508,8 +524,8 @@ batadv_find_router(struct batadv_priv *bat_priv,
/* mark the first possible candidate */
if (!first_candidate) {
- atomic_inc(&cand_router->refcount);
- atomic_inc(&cand->refcount);
+ kref_get(&cand_router->refcount);
+ kref_get(&cand->refcount);
first_candidate = cand;
first_candidate_router = cand_router;
}
@@ -529,16 +545,16 @@ batadv_find_router(struct batadv_priv *bat_priv,
next:
/* free references */
if (cand_router) {
- batadv_neigh_node_free_ref(cand_router);
+ batadv_neigh_node_put(cand_router);
cand_router = NULL;
}
- batadv_orig_ifinfo_free_ref(cand);
+ batadv_orig_ifinfo_put(cand);
}
rcu_read_unlock();
/* last_bonding_candidate is reset below, remove the old reference. */
if (orig_node->last_bonding_candidate)
- batadv_orig_ifinfo_free_ref(orig_node->last_bonding_candidate);
+ batadv_orig_ifinfo_put(orig_node->last_bonding_candidate);
/* After finding candidates, handle the three cases:
* 1) there is a next candidate, use that
@@ -546,17 +562,17 @@ next:
* 3) there is no candidate at all, return the default router
*/
if (next_candidate) {
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
/* remove references to first candidate, we don't need it. */
if (first_candidate) {
- batadv_neigh_node_free_ref(first_candidate_router);
- batadv_orig_ifinfo_free_ref(first_candidate);
+ batadv_neigh_node_put(first_candidate_router);
+ batadv_orig_ifinfo_put(first_candidate);
}
router = next_candidate_router;
orig_node->last_bonding_candidate = next_candidate;
} else if (first_candidate) {
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
/* refcounting has already been done in the loop above. */
router = first_candidate_router;
@@ -633,7 +649,7 @@ static int batadv_route_unicast_packet(struct sk_buff *skb,
out:
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return ret;
}
@@ -648,7 +664,7 @@ out:
* the new corresponding information (originator address where the destination
* client currently is and its known TTVN)
*
- * Returns true if the packet header has been updated, false otherwise
+ * Return: true if the packet header has been updated, false otherwise
*/
static bool
batadv_reroute_unicast_packet(struct batadv_priv *bat_priv,
@@ -686,9 +702,9 @@ batadv_reroute_unicast_packet(struct batadv_priv *bat_priv,
ret = true;
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return ret;
}
@@ -752,7 +768,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
return 0;
curr_ttvn = (u8)atomic_read(&orig_node->last_ttvn);
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
}
/* check if the TTVN contained in the packet is fresher than what the
@@ -792,7 +808,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
ether_addr_copy(unicast_packet->dest, primary_if->net_dev->dev_addr);
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
unicast_packet->ttvn = curr_ttvn;
@@ -805,7 +821,7 @@ static int batadv_check_unicast_ttvn(struct batadv_priv *bat_priv,
* @skb: unicast tvlv packet to process
* @recv_if: pointer to interface this packet was received on
*
- * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
* otherwise.
*/
int batadv_recv_unhandled_unicast_packet(struct sk_buff *skb,
@@ -836,6 +852,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
u8 *orig_addr;
struct batadv_orig_node *orig_node = NULL;
int check, hdr_size = sizeof(*unicast_packet);
+ enum batadv_subtype subtype;
bool is4addr;
unicast_packet = (struct batadv_unicast_packet *)skb->data;
@@ -863,10 +880,20 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
/* packet for me */
if (batadv_is_my_mac(bat_priv, unicast_packet->dest)) {
if (is4addr) {
- batadv_dat_inc_counter(bat_priv,
- unicast_4addr_packet->subtype);
- orig_addr = unicast_4addr_packet->src;
- orig_node = batadv_orig_hash_find(bat_priv, orig_addr);
+ subtype = unicast_4addr_packet->subtype;
+ batadv_dat_inc_counter(bat_priv, subtype);
+
+ /* Only payload data should be considered for speedy
+ * join. For example, DAT also uses unicast 4addr
+ * types, but those packets should not be considered
+ * for speedy join, since the clients do not actually
+ * reside at the sending originator.
+ */
+ if (subtype == BATADV_P_DATA) {
+ orig_addr = unicast_4addr_packet->src;
+ orig_node = batadv_orig_hash_find(bat_priv,
+ orig_addr);
+ }
}
if (batadv_dat_snoop_incoming_arp_request(bat_priv, skb,
@@ -881,7 +908,7 @@ int batadv_recv_unicast_packet(struct sk_buff *skb,
rx_success:
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return NET_RX_SUCCESS;
}
@@ -893,9 +920,8 @@ rx_success:
* batadv_recv_unicast_tvlv - receive and process unicast tvlv packets
* @skb: unicast tvlv packet to process
* @recv_if: pointer to interface this packet was received on
- * @dst_addr: the payload destination
*
- * Returns NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
+ * Return: NET_RX_SUCCESS if the packet has been consumed or NET_RX_DROP
* otherwise.
*/
int batadv_recv_unicast_tvlv(struct sk_buff *skb,
@@ -949,7 +975,7 @@ int batadv_recv_unicast_tvlv(struct sk_buff *skb,
* the assembled packet will exceed our MTU; 2) Buffer fragment, if we till
* lack further fragments; 3) Merge fragments, if we have all needed parts.
*
- * Return NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
+ * Return: NET_RX_DROP if the skb is not consumed, NET_RX_SUCCESS otherwise.
*/
int batadv_recv_frag_packet(struct sk_buff *skb,
struct batadv_hard_iface *recv_if)
@@ -993,7 +1019,7 @@ int batadv_recv_frag_packet(struct sk_buff *skb,
out:
if (orig_node_src)
- batadv_orig_node_free_ref(orig_node_src);
+ batadv_orig_node_put(orig_node_src);
return ret;
}
@@ -1054,7 +1080,8 @@ int batadv_recv_bcast_packet(struct sk_buff *skb,
/* check whether the packet is old and the host just restarted. */
if (batadv_window_protected(bat_priv, seq_diff,
- &orig_node->bcast_seqno_reset))
+ BATADV_BCAST_MAX_AGE,
+ &orig_node->bcast_seqno_reset, NULL))
goto spin_unlock;
/* mark broadcast in flood history, update window position
@@ -1097,6 +1124,6 @@ spin_unlock:
spin_unlock_bh(&orig_node->bcast_seqno_lock);
out:
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return ret;
}
diff --git a/net/batman-adv/routing.h b/net/batman-adv/routing.h
index 204bbe4952a6..02a5caa84127 100644
--- a/net/batman-adv/routing.h
+++ b/net/batman-adv/routing.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -52,6 +52,7 @@ batadv_find_router(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
struct batadv_hard_iface *recv_if);
int batadv_window_protected(struct batadv_priv *bat_priv, s32 seq_num_diff,
- unsigned long *last_reset);
+ s32 seq_old_max_diff, unsigned long *last_reset,
+ bool *protection_started);
#endif /* _NET_BATMAN_ADV_ROUTING_H_ */
diff --git a/net/batman-adv/send.c b/net/batman-adv/send.c
index f664324805eb..3ce06e0a91b1 100644
--- a/net/batman-adv/send.c
+++ b/net/batman-adv/send.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -49,16 +49,30 @@
static void batadv_send_outstanding_bcast_packet(struct work_struct *work);
-/* send out an already prepared packet to the given address via the
- * specified batman interface
+/**
+ * batadv_send_skb_packet - send an already prepared packet
+ * @skb: the packet to send
+ * @hard_iface: the interface to use to send the broadcast packet
+ * @dst_addr: the payload destination
+ *
+ * Send out an already prepared packet to the given neighbor or broadcast it
+ * using the specified interface. Either hard_iface or neigh_node must be not
+ * NULL.
+ * If neigh_node is NULL, then the packet is broadcasted using hard_iface,
+ * otherwise it is sent as unicast to the given neighbor.
+ *
+ * Return: NET_TX_DROP in case of error or the result of dev_queue_xmit(skb)
+ * otherwise
*/
int batadv_send_skb_packet(struct sk_buff *skb,
struct batadv_hard_iface *hard_iface,
const u8 *dst_addr)
{
- struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
+ struct batadv_priv *bat_priv;
struct ethhdr *ethhdr;
+ bat_priv = netdev_priv(hard_iface->soft_iface);
+
if (hard_iface->if_status != BATADV_IF_ACTIVE)
goto send_skb_err;
@@ -100,6 +114,35 @@ send_skb_err:
return NET_XMIT_DROP;
}
+int batadv_send_broadcast_skb(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface)
+{
+ return batadv_send_skb_packet(skb, hard_iface, batadv_broadcast_addr);
+}
+
+int batadv_send_unicast_skb(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh)
+{
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ struct batadv_hardif_neigh_node *hardif_neigh;
+#endif
+ int ret;
+
+ ret = batadv_send_skb_packet(skb, neigh->if_incoming, neigh->addr);
+
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ hardif_neigh = batadv_hardif_neigh_get(neigh->if_incoming, neigh->addr);
+
+ if ((hardif_neigh) && (ret != NET_XMIT_DROP))
+ hardif_neigh->bat_v.last_unicast_tx = jiffies;
+
+ if (hardif_neigh)
+ batadv_hardif_neigh_put(hardif_neigh);
+#endif
+
+ return ret;
+}
+
/**
* batadv_send_skb_to_orig - Lookup next-hop and transmit skb.
* @skb: Packet to be transmitted.
@@ -111,7 +154,7 @@ send_skb_err:
* host, NULL can be passed as recv_if and no interface alternating is
* attempted.
*
- * Returns NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or
+ * Return: NET_XMIT_SUCCESS on success, NET_XMIT_DROP on failure, or
* NET_XMIT_POLICED if the skb is buffered for later transmit.
*/
int batadv_send_skb_to_orig(struct sk_buff *skb,
@@ -146,14 +189,13 @@ int batadv_send_skb_to_orig(struct sk_buff *skb,
if (recv_if && batadv_nc_skb_forward(skb, neigh_node)) {
ret = NET_XMIT_POLICED;
} else {
- batadv_send_skb_packet(skb, neigh_node->if_incoming,
- neigh_node->addr);
+ batadv_send_unicast_skb(skb, neigh_node);
ret = NET_XMIT_SUCCESS;
}
out:
if (neigh_node)
- batadv_neigh_node_free_ref(neigh_node);
+ batadv_neigh_node_put(neigh_node);
return ret;
}
@@ -165,7 +207,7 @@ out:
* @hdr_size: amount of bytes to push at the beginning of the skb
* @orig_node: the destination node
*
- * Returns false if the buffer extension was not possible or true otherwise.
+ * Return: false if the buffer extension was not possible or true otherwise.
*/
static bool
batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size,
@@ -196,7 +238,7 @@ batadv_send_skb_push_fill_unicast(struct sk_buff *skb, int hdr_size,
* @skb: the skb containing the payload to encapsulate
* @orig_node: the destination node
*
- * Returns false if the payload could not be encapsulated or true otherwise.
+ * Return: false if the payload could not be encapsulated or true otherwise.
*/
static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
struct batadv_orig_node *orig_node)
@@ -211,10 +253,10 @@ static bool batadv_send_skb_prepare_unicast(struct sk_buff *skb,
* unicast 4addr header
* @bat_priv: the bat priv with all the soft interface information
* @skb: the skb containing the payload to encapsulate
- * @orig_node: the destination node
+ * @orig: the destination node
* @packet_subtype: the unicast 4addr packet subtype to use
*
- * Returns false if the payload could not be encapsulated or true otherwise.
+ * Return: false if the payload could not be encapsulated or true otherwise.
*/
bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
struct sk_buff *skb,
@@ -246,7 +288,7 @@ bool batadv_send_skb_prepare_unicast_4addr(struct batadv_priv *bat_priv,
ret = true;
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return ret;
}
@@ -265,7 +307,7 @@ out:
* as packet_type. Then send this frame to the given orig_node and release a
* reference to this orig_node.
*
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
*/
int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
struct sk_buff *skb, int packet_type,
@@ -317,7 +359,7 @@ int batadv_send_skb_unicast(struct batadv_priv *bat_priv,
out:
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
if (ret == NET_XMIT_DROP)
kfree_skb(skb);
return ret;
@@ -339,7 +381,7 @@ out:
* BATADV_UNICAST_4ADDR was supplied as packet_type. Then send this frame
* to the according destination node.
*
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
*/
int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
struct sk_buff *skb, int packet_type,
@@ -373,7 +415,7 @@ int batadv_send_skb_via_tt_generic(struct batadv_priv *bat_priv,
* Look up the currently selected gateway. Wrap the given skb into a batman-adv
* unicast header and send this frame to this gateway node.
*
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
*/
int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
unsigned short vid)
@@ -407,12 +449,11 @@ void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface)
static void batadv_forw_packet_free(struct batadv_forw_packet *forw_packet)
{
- if (forw_packet->skb)
- kfree_skb(forw_packet->skb);
+ kfree_skb(forw_packet->skb);
if (forw_packet->if_incoming)
- batadv_hardif_free_ref(forw_packet->if_incoming);
+ batadv_hardif_put(forw_packet->if_incoming);
if (forw_packet->if_outgoing)
- batadv_hardif_free_ref(forw_packet->if_outgoing);
+ batadv_hardif_put(forw_packet->if_outgoing);
kfree(forw_packet);
}
@@ -431,14 +472,19 @@ _batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
send_time);
}
-/* add a broadcast packet to the queue and setup timers. broadcast packets
- * are sent multiple times to increase probability for being received.
+/**
+ * batadv_add_bcast_packet_to_list - queue broadcast packet for multiple sends
+ * @bat_priv: the bat priv with all the soft interface information
+ * @skb: broadcast packet to add
+ * @delay: number of jiffies to wait before sending
*
- * This function returns NETDEV_TX_OK on success and NETDEV_TX_BUSY on
- * errors.
+ * add a broadcast packet to the queue and setup timers. broadcast packets
+ * are sent multiple times to increase probability for being received.
*
* The skb is not consumed, so the caller should make sure that the
* skb is freed.
+ *
+ * Return: NETDEV_TX_OK on success and NETDEV_TX_BUSY on errors.
*/
int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
const struct sk_buff *skb,
@@ -493,7 +539,7 @@ out_and_inc:
atomic_inc(&bat_priv->bcast_queue_left);
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return NETDEV_TX_BUSY;
}
@@ -534,8 +580,7 @@ static void batadv_send_outstanding_bcast_packet(struct work_struct *work)
/* send a copy of the saved skb */
skb1 = skb_clone(forw_packet->skb, GFP_ATOMIC);
if (skb1)
- batadv_send_skb_packet(skb1, hard_iface,
- batadv_broadcast_addr);
+ batadv_send_broadcast_skb(skb1, hard_iface);
}
rcu_read_unlock();
diff --git a/net/batman-adv/send.h b/net/batman-adv/send.h
index 82059f259e46..6fd7270d8ce6 100644
--- a/net/batman-adv/send.h
+++ b/net/batman-adv/send.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -28,12 +28,16 @@
struct sk_buff;
struct work_struct;
-int batadv_send_skb_packet(struct sk_buff *skb,
- struct batadv_hard_iface *hard_iface,
- const u8 *dst_addr);
int batadv_send_skb_to_orig(struct sk_buff *skb,
struct batadv_orig_node *orig_node,
struct batadv_hard_iface *recv_if);
+int batadv_send_skb_packet(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *dst_addr);
+int batadv_send_broadcast_skb(struct sk_buff *skb,
+ struct batadv_hard_iface *hard_iface);
+int batadv_send_unicast_skb(struct sk_buff *skb,
+ struct batadv_neigh_node *neigh_node);
void batadv_schedule_bat_ogm(struct batadv_hard_iface *hard_iface);
int batadv_add_bcast_packet_to_list(struct batadv_priv *bat_priv,
const struct sk_buff *skb,
@@ -69,7 +73,7 @@ int batadv_send_skb_via_gw(struct batadv_priv *bat_priv, struct sk_buff *skb,
* header via the translation table. Wrap the given skb into a batman-adv
* unicast header. Then send this frame to the according destination node.
*
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
*/
static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
struct sk_buff *skb, u8 *dst_hint,
@@ -92,7 +96,7 @@ static inline int batadv_send_skb_via_tt(struct batadv_priv *bat_priv,
* unicast-4addr header. Then send this frame to the according destination
* node.
*
- * Returns NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
+ * Return: NET_XMIT_DROP in case of error or NET_XMIT_SUCCESS otherwise.
*/
static inline int batadv_send_skb_via_tt_4addr(struct batadv_priv *bat_priv,
struct sk_buff *skb,
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index ac4d08de5df4..0710379491bf 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -30,6 +30,7 @@
#include <linux/if_vlan.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
@@ -376,7 +377,7 @@ dropped_freed:
batadv_inc_counter(bat_priv, BATADV_CNT_TX_DROPPED);
end:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return NETDEV_TX_OK;
}
@@ -478,22 +479,34 @@ out:
}
/**
- * batadv_softif_vlan_free_ref - decrease the vlan object refcounter and
- * possibly free it
- * @softif_vlan: the vlan object to release
+ * batadv_softif_vlan_release - release vlan from lists and queue for free after
+ * rcu grace period
+ * @ref: kref pointer of the vlan object
*/
-void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
+static void batadv_softif_vlan_release(struct kref *ref)
+{
+ struct batadv_softif_vlan *vlan;
+
+ vlan = container_of(ref, struct batadv_softif_vlan, refcount);
+
+ spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock);
+ hlist_del_rcu(&vlan->list);
+ spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock);
+
+ kfree_rcu(vlan, rcu);
+}
+
+/**
+ * batadv_softif_vlan_put - decrease the vlan object refcounter and
+ * possibly release it
+ * @vlan: the vlan object to release
+ */
+void batadv_softif_vlan_put(struct batadv_softif_vlan *vlan)
{
if (!vlan)
return;
- if (atomic_dec_and_test(&vlan->refcount)) {
- spin_lock_bh(&vlan->bat_priv->softif_vlan_list_lock);
- hlist_del_rcu(&vlan->list);
- spin_unlock_bh(&vlan->bat_priv->softif_vlan_list_lock);
-
- kfree_rcu(vlan, rcu);
- }
+ kref_put(&vlan->refcount, batadv_softif_vlan_release);
}
/**
@@ -501,7 +514,7 @@ void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *vlan)
* @bat_priv: the bat priv with all the soft interface information
* @vid: the identifier of the vlan object to retrieve
*
- * Returns the private data of the vlan matching the vid passed as argument or
+ * Return: the private data of the vlan matching the vid passed as argument or
* NULL otherwise. The refcounter of the returned object is incremented by 1.
*/
struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
@@ -514,7 +527,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
if (vlan_tmp->vid != vid)
continue;
- if (!atomic_inc_not_zero(&vlan_tmp->refcount))
+ if (!kref_get_unless_zero(&vlan_tmp->refcount))
continue;
vlan = vlan_tmp;
@@ -530,7 +543,7 @@ struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
* @bat_priv: the bat priv with all the soft interface information
* @vid: the VLAN identifier
*
- * Returns 0 on success, a negative error otherwise.
+ * Return: 0 on success, a negative error otherwise.
*/
int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
{
@@ -539,7 +552,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
vlan = batadv_softif_vlan_get(bat_priv, vid);
if (vlan) {
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
return -EEXIST;
}
@@ -549,7 +562,7 @@ int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid)
vlan->bat_priv = bat_priv;
vlan->vid = vid;
- atomic_set(&vlan->refcount, 1);
+ kref_init(&vlan->refcount);
atomic_set(&vlan->ap_isolation, 0);
@@ -588,18 +601,19 @@ static void batadv_softif_destroy_vlan(struct batadv_priv *bat_priv,
vlan->vid, "vlan interface destroyed", false);
batadv_sysfs_del_vlan(bat_priv, vlan);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
}
/**
* batadv_interface_add_vid - ndo_add_vid API implementation
* @dev: the netdev of the mesh interface
+ * @proto: protocol of the the vlan id
* @vid: identifier of the new vlan
*
* Set up all the internal structures for handling the new vlan on top of the
* mesh interface
*
- * Returns 0 on success or a negative error code in case of failure.
+ * Return: 0 on success or a negative error code in case of failure.
*/
static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
unsigned short vid)
@@ -632,7 +646,7 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
if (!vlan->kobj) {
ret = batadv_sysfs_add_vlan(bat_priv->soft_iface, vlan);
if (ret) {
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
return ret;
}
}
@@ -651,12 +665,13 @@ static int batadv_interface_add_vid(struct net_device *dev, __be16 proto,
/**
* batadv_interface_kill_vid - ndo_kill_vid API implementation
* @dev: the netdev of the mesh interface
+ * @proto: protocol of the the vlan id
* @vid: identifier of the deleted vlan
*
* Destroy all the internal structures used to handle the vlan identified by vid
* on top of the mesh interface
*
- * Returns 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q
+ * Return: 0 on success, -EINVAL if the specified prototype is not ETH_P_8021Q
* or -ENOENT if the specified vlan id wasn't registered.
*/
static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
@@ -678,7 +693,7 @@ static int batadv_interface_kill_vid(struct net_device *dev, __be16 proto,
batadv_softif_destroy_vlan(bat_priv, vlan);
/* finally free the vlan object */
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
return 0;
}
@@ -734,7 +749,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work)
vlan = batadv_softif_vlan_get(bat_priv, BATADV_NO_FLAGS);
if (vlan) {
batadv_softif_destroy_vlan(bat_priv, vlan);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
}
batadv_sysfs_del_meshif(soft_iface);
@@ -745,7 +760,7 @@ static void batadv_softif_destroy_finish(struct work_struct *work)
* batadv_softif_init_late - late stage initialization of soft interface
* @dev: registered network device to modify
*
- * Returns error code on failures
+ * Return: error code on failures
*/
static int batadv_softif_init_late(struct net_device *dev)
{
@@ -847,7 +862,7 @@ free_bat_counters:
* @dev: batadv_soft_interface used as master interface
* @slave_dev: net_device which should become the slave interface
*
- * Return 0 if successful or error otherwise.
+ * Return: 0 if successful or error otherwise.
*/
static int batadv_softif_slave_add(struct net_device *dev,
struct net_device *slave_dev)
@@ -863,7 +878,7 @@ static int batadv_softif_slave_add(struct net_device *dev,
out:
if (hard_iface)
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
return ret;
}
@@ -872,7 +887,7 @@ out:
* @dev: batadv_soft_interface used as master interface
* @slave_dev: net_device which should be removed from the master interface
*
- * Return 0 if successful or error otherwise.
+ * Return: 0 if successful or error otherwise.
*/
static int batadv_softif_slave_del(struct net_device *dev,
struct net_device *slave_dev)
@@ -890,7 +905,7 @@ static int batadv_softif_slave_del(struct net_device *dev,
out:
if (hard_iface)
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
return ret;
}
diff --git a/net/batman-adv/soft-interface.h b/net/batman-adv/soft-interface.h
index 8e82176f40b1..9ae265703d23 100644
--- a/net/batman-adv/soft-interface.h
+++ b/net/batman-adv/soft-interface.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -34,7 +34,7 @@ void batadv_softif_destroy_sysfs(struct net_device *soft_iface);
int batadv_softif_is_valid(const struct net_device *net_dev);
extern struct rtnl_link_ops batadv_link_ops;
int batadv_softif_create_vlan(struct batadv_priv *bat_priv, unsigned short vid);
-void batadv_softif_vlan_free_ref(struct batadv_softif_vlan *softif_vlan);
+void batadv_softif_vlan_put(struct batadv_softif_vlan *softif_vlan);
struct batadv_softif_vlan *batadv_softif_vlan_get(struct batadv_priv *bat_priv,
unsigned short vid);
diff --git a/net/batman-adv/sysfs.c b/net/batman-adv/sysfs.c
index 9de3c8804ff4..e7cf51333a36 100644
--- a/net/batman-adv/sysfs.c
+++ b/net/batman-adv/sysfs.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
@@ -25,6 +25,7 @@
#include <linux/fs.h>
#include <linux/if.h>
#include <linux/if_vlan.h>
+#include <linux/kref.h>
#include <linux/kernel.h>
#include <linux/netdevice.h>
#include <linux/printk.h>
@@ -40,6 +41,7 @@
#include "distributed-arp-table.h"
#include "gateway_client.h"
#include "gateway_common.h"
+#include "bridge_loop_avoidance.h"
#include "hard-interface.h"
#include "network-coding.h"
#include "packet.h"
@@ -63,7 +65,7 @@ static struct batadv_priv *batadv_kobj_to_batpriv(struct kobject *obj)
* batadv_vlan_kobj_to_batpriv - convert a vlan kobj in the associated batpriv
* @obj: kobject to covert
*
- * Returns the associated batadv_priv struct.
+ * Return: the associated batadv_priv struct.
*/
static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
{
@@ -81,9 +83,10 @@ static struct batadv_priv *batadv_vlan_kobj_to_batpriv(struct kobject *obj)
/**
* batadv_kobj_to_vlan - convert a kobj in the associated softif_vlan struct
+ * @bat_priv: the bat priv with all the soft interface information
* @obj: kobject to covert
*
- * Returns the associated softif_vlan struct if found, NULL otherwise.
+ * Return: the associated softif_vlan struct if found, NULL otherwise.
*/
static struct batadv_softif_vlan *
batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj)
@@ -95,7 +98,7 @@ batadv_kobj_to_vlan(struct batadv_priv *bat_priv, struct kobject *obj)
if (vlan_tmp->kobj != obj)
continue;
- if (!atomic_inc_not_zero(&vlan_tmp->refcount))
+ if (!kref_get_unless_zero(&vlan_tmp->refcount))
continue;
vlan = vlan_tmp;
@@ -213,7 +216,7 @@ ssize_t batadv_store_vlan_##_name(struct kobject *kobj, \
attr, &vlan->_name, \
bat_priv->soft_iface); \
\
- batadv_softif_vlan_free_ref(vlan); \
+ batadv_softif_vlan_put(vlan); \
return res; \
}
@@ -228,7 +231,7 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \
atomic_read(&vlan->_name) == 0 ? \
"disabled" : "enabled"); \
\
- batadv_softif_vlan_free_ref(vlan); \
+ batadv_softif_vlan_put(vlan); \
return res; \
}
@@ -239,12 +242,64 @@ ssize_t batadv_show_vlan_##_name(struct kobject *kobj, \
static BATADV_ATTR_VLAN(_name, _mode, batadv_show_vlan_##_name, \
batadv_store_vlan_##_name)
+#define BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, _max, _post_func) \
+ssize_t batadv_store_##_name(struct kobject *kobj, \
+ struct attribute *attr, char *buff, \
+ size_t count) \
+{ \
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
+ struct batadv_hard_iface *hard_iface; \
+ ssize_t length; \
+ \
+ hard_iface = batadv_hardif_get_by_netdev(net_dev); \
+ if (!hard_iface) \
+ return 0; \
+ \
+ length = __batadv_store_uint_attr(buff, count, _min, _max, \
+ _post_func, attr, \
+ &hard_iface->_var, net_dev); \
+ \
+ batadv_hardif_put(hard_iface); \
+ return length; \
+}
+
+#define BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \
+ssize_t batadv_show_##_name(struct kobject *kobj, \
+ struct attribute *attr, char *buff) \
+{ \
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj); \
+ struct batadv_hard_iface *hard_iface; \
+ ssize_t length; \
+ \
+ hard_iface = batadv_hardif_get_by_netdev(net_dev); \
+ if (!hard_iface) \
+ return 0; \
+ \
+ length = sprintf(buff, "%i\n", atomic_read(&hard_iface->_var)); \
+ \
+ batadv_hardif_put(hard_iface); \
+ return length; \
+}
+
+/* Use this, if you are going to set [name] in hard_iface to an
+ * unsigned integer value
+ */
+#define BATADV_ATTR_HIF_UINT(_name, _var, _mode, _min, _max, _post_func)\
+ static BATADV_ATTR_HIF_STORE_UINT(_name, _var, _min, \
+ _max, _post_func) \
+ static BATADV_ATTR_HIF_SHOW_UINT(_name, _var) \
+ static BATADV_ATTR(_name, _mode, batadv_show_##_name, \
+ batadv_store_##_name)
+
static int batadv_store_bool_attr(char *buff, size_t count,
struct net_device *net_dev,
- const char *attr_name, atomic_t *attr)
+ const char *attr_name, atomic_t *attr,
+ bool *changed)
{
int enabled = -1;
+ *changed = false;
+
if (buff[count - 1] == '\n')
buff[count - 1] = '\0';
@@ -271,6 +326,8 @@ static int batadv_store_bool_attr(char *buff, size_t count,
atomic_read(attr) == 1 ? "enabled" : "disabled",
enabled == 1 ? "enabled" : "disabled");
+ *changed = true;
+
atomic_set(attr, (unsigned int)enabled);
return count;
}
@@ -281,11 +338,12 @@ __batadv_store_bool_attr(char *buff, size_t count,
struct attribute *attr,
atomic_t *attr_store, struct net_device *net_dev)
{
+ bool changed;
int ret;
ret = batadv_store_bool_attr(buff, count, net_dev, attr->name,
- attr_store);
- if (post_func && ret)
+ attr_store, &changed);
+ if (post_func && changed)
post_func(net_dev);
return ret;
@@ -484,7 +542,7 @@ static ssize_t batadv_store_gw_bwidth(struct kobject *kobj,
* @attr: the batman-adv attribute the user is interacting with
* @buff: the buffer that will contain the data to send back to the user
*
- * Returns the number of bytes written into 'buff' on success or a negative
+ * Return: the number of bytes written into 'buff' on success or a negative
* error code in case of failure
*/
static ssize_t batadv_show_isolation_mark(struct kobject *kobj,
@@ -504,7 +562,7 @@ static ssize_t batadv_show_isolation_mark(struct kobject *kobj,
* @buff: the buffer containing the user data
* @count: number of bytes in the buffer
*
- * Returns 'count' on success or a negative error code in case of failure
+ * Return: 'count' on success or a negative error code in case of failure
*/
static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
struct attribute *attr, char *buff,
@@ -549,7 +607,8 @@ static ssize_t batadv_store_isolation_mark(struct kobject *kobj,
BATADV_ATTR_SIF_BOOL(aggregated_ogms, S_IRUGO | S_IWUSR, NULL);
BATADV_ATTR_SIF_BOOL(bonding, S_IRUGO | S_IWUSR, NULL);
#ifdef CONFIG_BATMAN_ADV_BLA
-BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, S_IRUGO | S_IWUSR, NULL);
+BATADV_ATTR_SIF_BOOL(bridge_loop_avoidance, S_IRUGO | S_IWUSR,
+ batadv_bla_status_update);
#endif
#ifdef CONFIG_BATMAN_ADV_DAT
BATADV_ATTR_SIF_BOOL(distributed_arp_table, S_IRUGO | S_IWUSR,
@@ -612,9 +671,7 @@ static struct batadv_attribute *batadv_mesh_attrs[] = {
BATADV_ATTR_VLAN_BOOL(ap_isolation, S_IRUGO | S_IWUSR, NULL);
-/**
- * batadv_vlan_attrs - array of vlan specific sysfs attributes
- */
+/* array of vlan specific sysfs attributes */
static struct batadv_attribute *batadv_vlan_attrs[] = {
&batadv_attr_vlan_ap_isolation,
NULL,
@@ -675,7 +732,7 @@ void batadv_sysfs_del_meshif(struct net_device *dev)
* @dev: netdev of the mesh interface
* @vlan: private data of the newly added VLAN interface
*
- * Returns 0 on success and -ENOMEM if any of the structure allocations fails.
+ * Return: 0 on success and -ENOMEM if any of the structure allocations fails.
*/
int batadv_sysfs_add_vlan(struct net_device *dev,
struct batadv_softif_vlan *vlan)
@@ -763,7 +820,7 @@ static ssize_t batadv_show_mesh_iface(struct kobject *kobj,
length = sprintf(buff, "%s\n", ifname);
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
return length;
}
@@ -787,7 +844,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
if (strlen(buff) >= IFNAMSIZ) {
pr_err("Invalid parameter for 'mesh_iface' setting received: interface name too long '%s'\n",
buff);
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
return -EINVAL;
}
@@ -821,7 +878,7 @@ static ssize_t batadv_store_mesh_iface(struct kobject *kobj,
unlock:
rtnl_unlock();
out:
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
return ret;
}
@@ -855,18 +912,99 @@ static ssize_t batadv_show_iface_status(struct kobject *kobj,
break;
}
- batadv_hardif_free_ref(hard_iface);
+ batadv_hardif_put(hard_iface);
return length;
}
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+
+/**
+ * batadv_store_throughput_override - parse and store throughput override
+ * entered by the user
+ * @kobj: kobject representing the private mesh sysfs directory
+ * @attr: the batman-adv attribute the user is interacting with
+ * @buff: the buffer containing the user data
+ * @count: number of bytes in the buffer
+ *
+ * Return: 'count' on success or a negative error code in case of failure
+ */
+static ssize_t batadv_store_throughput_override(struct kobject *kobj,
+ struct attribute *attr,
+ char *buff, size_t count)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct batadv_hard_iface *hard_iface;
+ u32 tp_override;
+ u32 old_tp_override;
+ bool ret;
+
+ hard_iface = batadv_hardif_get_by_netdev(net_dev);
+ if (!hard_iface)
+ return -EINVAL;
+
+ if (buff[count - 1] == '\n')
+ buff[count - 1] = '\0';
+
+ ret = batadv_parse_throughput(net_dev, buff, "throughput_override",
+ &tp_override);
+ if (!ret)
+ return count;
+
+ old_tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
+ if (old_tp_override == tp_override)
+ goto out;
+
+ batadv_info(net_dev, "%s: Changing from: %u.%u MBit to: %u.%u MBit\n",
+ "throughput_override",
+ old_tp_override / 10, old_tp_override % 10,
+ tp_override / 10, tp_override % 10);
+
+ atomic_set(&hard_iface->bat_v.throughput_override, tp_override);
+
+out:
+ batadv_hardif_put(hard_iface);
+ return count;
+}
+
+static ssize_t batadv_show_throughput_override(struct kobject *kobj,
+ struct attribute *attr,
+ char *buff)
+{
+ struct net_device *net_dev = batadv_kobj_to_netdev(kobj);
+ struct batadv_hard_iface *hard_iface;
+ u32 tp_override;
+
+ hard_iface = batadv_hardif_get_by_netdev(net_dev);
+ if (!hard_iface)
+ return -EINVAL;
+
+ tp_override = atomic_read(&hard_iface->bat_v.throughput_override);
+
+ return sprintf(buff, "%u.%u MBit\n", tp_override / 10,
+ tp_override % 10);
+}
+
+#endif
+
static BATADV_ATTR(mesh_iface, S_IRUGO | S_IWUSR, batadv_show_mesh_iface,
batadv_store_mesh_iface);
static BATADV_ATTR(iface_status, S_IRUGO, batadv_show_iface_status, NULL);
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+BATADV_ATTR_HIF_UINT(elp_interval, bat_v.elp_interval, S_IRUGO | S_IWUSR,
+ 2 * BATADV_JITTER, INT_MAX, NULL);
+static BATADV_ATTR(throughput_override, S_IRUGO | S_IWUSR,
+ batadv_show_throughput_override,
+ batadv_store_throughput_override);
+#endif
static struct batadv_attribute *batadv_batman_attrs[] = {
&batadv_attr_mesh_iface,
&batadv_attr_iface_status,
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ &batadv_attr_elp_interval,
+ &batadv_attr_throughput_override,
+#endif
NULL,
};
diff --git a/net/batman-adv/sysfs.h b/net/batman-adv/sysfs.h
index 61974428a7af..c76021b4e198 100644
--- a/net/batman-adv/sysfs.h
+++ b/net/batman-adv/sysfs.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2010-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2010-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner
*
diff --git a/net/batman-adv/translation-table.c b/net/batman-adv/translation-table.c
index 4228b10c47ea..0b43e86328a5 100644
--- a/net/batman-adv/translation-table.c
+++ b/net/batman-adv/translation-table.c
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
*
@@ -31,6 +31,7 @@
#include <linux/jhash.h>
#include <linux/jiffies.h>
#include <linux/kernel.h>
+#include <linux/kref.h>
#include <linux/list.h>
#include <linux/lockdep.h>
#include <linux/netdevice.h>
@@ -68,13 +69,23 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
unsigned short vid, const char *message,
bool roaming);
-/* returns 1 if they are the same mac addr */
+/**
+ * batadv_compare_tt - check if two TT entries are the same
+ * @node: the list element pointer of the first TT entry
+ * @data2: pointer to the tt_common_entry of the second TT entry
+ *
+ * Compare the MAC address and the VLAN ID of the two TT entries and check if
+ * they are the same TT client.
+ * Return: 1 if the two TT clients are the same, 0 otherwise
+ */
static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
{
const void *data1 = container_of(node, struct batadv_tt_common_entry,
hash_entry);
+ const struct batadv_tt_common_entry *tt1 = data1;
+ const struct batadv_tt_common_entry *tt2 = data2;
- return batadv_compare_eth(data1, data2);
+ return (tt1->vid == tt2->vid) && batadv_compare_eth(data1, data2);
}
/**
@@ -82,7 +93,7 @@ static int batadv_compare_tt(const struct hlist_node *node, const void *data2)
* @data: pointer to the tt_common_entry object to map
* @size: the size of the hash table
*
- * Returns the hash index where the object represented by 'data' should be
+ * Return: the hash index where the object represented by 'data' should be
* stored at.
*/
static inline u32 batadv_choose_tt(const void *data, u32 size)
@@ -103,7 +114,7 @@ static inline u32 batadv_choose_tt(const void *data, u32 size)
* @addr: the mac address of the client to look for
* @vid: VLAN identifier
*
- * Returns a pointer to the tt_common struct belonging to the searched client if
+ * Return: a pointer to the tt_common struct belonging to the searched client if
* found, NULL otherwise.
*/
static struct batadv_tt_common_entry *
@@ -131,7 +142,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr,
if (tt->vid != vid)
continue;
- if (!atomic_inc_not_zero(&tt->refcount))
+ if (!kref_get_unless_zero(&tt->refcount))
continue;
tt_tmp = tt;
@@ -148,7 +159,7 @@ batadv_tt_hash_find(struct batadv_hashtable *hash, const u8 *addr,
* @addr: the mac address of the client to look for
* @vid: VLAN identifier
*
- * Returns a pointer to the corresponding tt_local_entry struct if the client is
+ * Return: a pointer to the corresponding tt_local_entry struct if the client is
* found, NULL otherwise.
*/
static struct batadv_tt_local_entry *
@@ -173,7 +184,7 @@ batadv_tt_local_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
* @addr: the mac address of the client to look for
* @vid: VLAN identifier
*
- * Returns a pointer to the corresponding tt_global_entry struct if the client
+ * Return: a pointer to the corresponding tt_global_entry struct if the client
* is found, NULL otherwise.
*/
static struct batadv_tt_global_entry *
@@ -192,34 +203,68 @@ batadv_tt_global_hash_find(struct batadv_priv *bat_priv, const u8 *addr,
return tt_global_entry;
}
+/**
+ * batadv_tt_local_entry_release - release tt_local_entry from lists and queue
+ * for free after rcu grace period
+ * @ref: kref pointer of the nc_node
+ */
+static void batadv_tt_local_entry_release(struct kref *ref)
+{
+ struct batadv_tt_local_entry *tt_local_entry;
+
+ tt_local_entry = container_of(ref, struct batadv_tt_local_entry,
+ common.refcount);
+
+ kfree_rcu(tt_local_entry, common.rcu);
+}
+
+/**
+ * batadv_tt_local_entry_put - decrement the tt_local_entry refcounter and
+ * possibly release it
+ * @tt_local_entry: tt_local_entry to be free'd
+ */
static void
-batadv_tt_local_entry_free_ref(struct batadv_tt_local_entry *tt_local_entry)
+batadv_tt_local_entry_put(struct batadv_tt_local_entry *tt_local_entry)
+{
+ kref_put(&tt_local_entry->common.refcount,
+ batadv_tt_local_entry_release);
+}
+
+/**
+ * batadv_tt_global_entry_release - release tt_global_entry from lists and queue
+ * for free after rcu grace period
+ * @ref: kref pointer of the nc_node
+ */
+static void batadv_tt_global_entry_release(struct kref *ref)
{
- if (atomic_dec_and_test(&tt_local_entry->common.refcount))
- kfree_rcu(tt_local_entry, common.rcu);
+ struct batadv_tt_global_entry *tt_global_entry;
+
+ tt_global_entry = container_of(ref, struct batadv_tt_global_entry,
+ common.refcount);
+
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ kfree_rcu(tt_global_entry, common.rcu);
}
/**
- * batadv_tt_global_entry_free_ref - decrement the refcounter for a
- * tt_global_entry and possibly free it
- * @tt_global_entry: the object to free
+ * batadv_tt_global_entry_put - decrement the tt_global_entry refcounter and
+ * possibly release it
+ * @tt_global_entry: tt_global_entry to be free'd
*/
static void
-batadv_tt_global_entry_free_ref(struct batadv_tt_global_entry *tt_global_entry)
+batadv_tt_global_entry_put(struct batadv_tt_global_entry *tt_global_entry)
{
- if (atomic_dec_and_test(&tt_global_entry->common.refcount)) {
- batadv_tt_global_del_orig_list(tt_global_entry);
- kfree_rcu(tt_global_entry, common.rcu);
- }
+ kref_put(&tt_global_entry->common.refcount,
+ batadv_tt_global_entry_release);
}
/**
* batadv_tt_global_hash_count - count the number of orig entries
- * @hash: hash table containing the tt entries
+ * @bat_priv: the bat priv with all the soft interface information
* @addr: the mac address of the client to count entries for
* @vid: VLAN identifier
*
- * Return the number of originators advertising the given address/data
+ * Return: the number of originators advertising the given address/data
* (excluding ourself).
*/
int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
@@ -233,25 +278,11 @@ int batadv_tt_global_hash_count(struct batadv_priv *bat_priv,
return 0;
count = atomic_read(&tt_global_entry->orig_list_count);
- batadv_tt_global_entry_free_ref(tt_global_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
return count;
}
-static void batadv_tt_orig_list_entry_free_rcu(struct rcu_head *rcu)
-{
- struct batadv_tt_orig_list_entry *orig_entry;
-
- orig_entry = container_of(rcu, struct batadv_tt_orig_list_entry, rcu);
-
- /* We are in an rcu callback here, therefore we cannot use
- * batadv_orig_node_free_ref() and its call_rcu():
- * An rcu_barrier() wouldn't wait for that to finish
- */
- batadv_orig_node_free_ref_now(orig_entry->orig_node);
- kfree(orig_entry);
-}
-
/**
* batadv_tt_local_size_mod - change the size by v of the local table identified
* by vid
@@ -270,7 +301,7 @@ static void batadv_tt_local_size_mod(struct batadv_priv *bat_priv,
atomic_add(v, &vlan->tt.num_entries);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
}
/**
@@ -298,9 +329,9 @@ static void batadv_tt_local_size_dec(struct batadv_priv *bat_priv,
}
/**
- * batadv_tt_global_size_mod - change the size by v of the local table
- * identified by vid
- * @bat_priv: the bat priv with all the soft interface information
+ * batadv_tt_global_size_mod - change the size by v of the global table
+ * for orig_node identified by vid
+ * @orig_node: the originator for which the table has to be modified
* @vid: the VLAN identifier
* @v: the amount to sum to the global table size
*/
@@ -315,12 +346,14 @@ static void batadv_tt_global_size_mod(struct batadv_orig_node *orig_node,
if (atomic_add_return(v, &vlan->tt.num_entries) == 0) {
spin_lock_bh(&orig_node->vlan_list_lock);
- hlist_del_init_rcu(&vlan->list);
+ if (!hlist_unhashed(&vlan->list)) {
+ hlist_del_init_rcu(&vlan->list);
+ batadv_orig_node_vlan_put(vlan);
+ }
spin_unlock_bh(&orig_node->vlan_list_lock);
- batadv_orig_node_vlan_free_ref(vlan);
}
- batadv_orig_node_vlan_free_ref(vlan);
+ batadv_orig_node_vlan_put(vlan);
}
/**
@@ -347,13 +380,31 @@ static void batadv_tt_global_size_dec(struct batadv_orig_node *orig_node,
batadv_tt_global_size_mod(orig_node, vid, -1);
}
-static void
-batadv_tt_orig_list_entry_free_ref(struct batadv_tt_orig_list_entry *orig_entry)
+/**
+ * batadv_tt_orig_list_entry_release - release tt orig entry from lists and
+ * queue for free after rcu grace period
+ * @ref: kref pointer of the tt orig entry
+ */
+static void batadv_tt_orig_list_entry_release(struct kref *ref)
{
- if (!atomic_dec_and_test(&orig_entry->refcount))
- return;
+ struct batadv_tt_orig_list_entry *orig_entry;
- call_rcu(&orig_entry->rcu, batadv_tt_orig_list_entry_free_rcu);
+ orig_entry = container_of(ref, struct batadv_tt_orig_list_entry,
+ refcount);
+
+ batadv_orig_node_put(orig_entry->orig_node);
+ kfree_rcu(orig_entry, rcu);
+}
+
+/**
+ * batadv_tt_orig_list_entry_put - decrement the tt orig entry refcounter and
+ * possibly release it
+ * @orig_entry: tt orig entry to be free'd
+ */
+static void
+batadv_tt_orig_list_entry_put(struct batadv_tt_orig_list_entry *orig_entry)
+{
+ kref_put(&orig_entry->refcount, batadv_tt_orig_list_entry_release);
}
/**
@@ -435,7 +486,7 @@ unlock:
* batadv_tt_len - compute length in bytes of given number of tt changes
* @changes_num: number of tt changes
*
- * Returns computed length in bytes.
+ * Return: computed length in bytes.
*/
static int batadv_tt_len(int changes_num)
{
@@ -446,7 +497,7 @@ static int batadv_tt_len(int changes_num)
* batadv_tt_entries - compute the number of entries fitting in tt_len bytes
* @tt_len: available space
*
- * Returns the number of entries.
+ * Return: the number of entries.
*/
static u16 batadv_tt_entries(u16 tt_len)
{
@@ -458,7 +509,7 @@ static u16 batadv_tt_entries(u16 tt_len)
* size when transmitted over the air
* @bat_priv: the bat priv with all the soft interface information
*
- * Returns local translation table size in bytes.
+ * Return: local translation table size in bytes.
*/
static int batadv_tt_local_table_transmit_size(struct batadv_priv *bat_priv)
{
@@ -510,7 +561,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv,
batadv_hash_remove(bat_priv->tt.global_hash, batadv_compare_tt,
batadv_choose_tt, &tt_global->common);
- batadv_tt_global_entry_free_ref(tt_global);
+ batadv_tt_global_entry_put(tt_global);
}
/**
@@ -524,7 +575,7 @@ static void batadv_tt_global_free(struct batadv_priv *bat_priv,
* @mark: the value contained in the skb->mark field of the received packet (if
* any)
*
- * Returns true if the client was successfully added, false otherwise.
+ * Return: true if the client was successfully added, false otherwise.
*/
bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
unsigned short vid, int ifindex, u32 mark)
@@ -618,7 +669,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
tt_local->common.vid = vid;
if (batadv_is_wifi_netdev(in_dev))
tt_local->common.flags |= BATADV_TT_CLIENT_WIFI;
- atomic_set(&tt_local->common.refcount, 2);
+ kref_init(&tt_local->common.refcount);
+ kref_get(&tt_local->common.refcount);
tt_local->last_seen = jiffies;
tt_local->common.added_at = tt_local->last_seen;
@@ -635,8 +687,8 @@ bool batadv_tt_local_add(struct net_device *soft_iface, const u8 *addr,
if (unlikely(hash_added != 0)) {
/* remove the reference for the hash */
- batadv_tt_local_entry_free_ref(tt_local);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_tt_local_entry_put(tt_local);
+ batadv_softif_vlan_put(vlan);
goto out;
}
@@ -702,9 +754,9 @@ out:
if (in_dev)
dev_put(in_dev);
if (tt_local)
- batadv_tt_local_entry_free_ref(tt_local);
+ batadv_tt_local_entry_put(tt_local);
if (tt_global)
- batadv_tt_global_entry_free_ref(tt_global);
+ batadv_tt_global_entry_put(tt_global);
return ret;
}
@@ -719,12 +771,11 @@ out:
* function reserves the amount of space needed to send the entire global TT
* table. In case of success the value is updated with the real amount of
* reserved bytes
-
* Allocate the needed amount of memory for the entire TT TVLV and write its
* header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
* objects, one per active VLAN served by the originator node.
*
- * Return the size of the allocated buffer or 0 in case of failure.
+ * Return: the size of the allocated buffer or 0 in case of failure.
*/
static u16
batadv_tt_prepare_tvlv_global_data(struct batadv_orig_node *orig_node,
@@ -798,7 +849,7 @@ out:
* header made up by one tvlv_tt_data object and a series of tvlv_tt_vlan_data
* objects, one per active VLAN.
*
- * Return the size of the allocated buffer or 0 in case of failure.
+ * Return: the size of the allocated buffer or 0 in case of failure.
*/
static u16
batadv_tt_prepare_tvlv_local_data(struct batadv_priv *bat_priv,
@@ -1003,13 +1054,13 @@ int batadv_tt_local_seq_print_text(struct seq_file *seq, void *offset)
no_purge ? 0 : last_seen_msecs,
vlan->tt.crc);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
}
rcu_read_unlock();
}
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return 0;
}
@@ -1040,7 +1091,7 @@ batadv_tt_local_set_pending(struct batadv_priv *bat_priv,
* @message: message to append to the log on deletion
* @roaming: true if the deletion is due to a roaming event
*
- * Returns the flags assigned to the local entry before being deleted
+ * Return: the flags assigned to the local entry before being deleted
*/
u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid, const char *message,
@@ -1086,19 +1137,19 @@ u16 batadv_tt_local_remove(struct batadv_priv *bat_priv, const u8 *addr,
goto out;
/* extra call to free the local tt entry */
- batadv_tt_local_entry_free_ref(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
/* decrease the reference held for this vlan */
vlan = batadv_softif_vlan_get(bat_priv, vid);
if (!vlan)
goto out;
- batadv_softif_vlan_free_ref(vlan);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
+ batadv_softif_vlan_put(vlan);
out:
if (tt_local_entry)
- batadv_tt_local_entry_free_ref(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return curr_flags;
}
@@ -1194,11 +1245,11 @@ static void batadv_tt_local_table_free(struct batadv_priv *bat_priv)
vlan = batadv_softif_vlan_get(bat_priv,
tt_common_entry->vid);
if (vlan) {
- batadv_softif_vlan_free_ref(vlan);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
+ batadv_softif_vlan_put(vlan);
}
- batadv_tt_local_entry_free_ref(tt_local);
+ batadv_tt_local_entry_put(tt_local);
}
spin_unlock_bh(list_lock);
}
@@ -1240,10 +1291,16 @@ static void batadv_tt_changes_list_free(struct batadv_priv *bat_priv)
spin_unlock_bh(&bat_priv->tt.changes_list_lock);
}
-/* retrieves the orig_tt_list_entry belonging to orig_node from the
+/**
+ * batadv_tt_global_orig_entry_find - find a TT orig_list_entry
+ * @entry: the TT global entry where the orig_list_entry has to be
+ * extracted from
+ * @orig_node: the originator for which the orig_list_entry has to be found
+ *
+ * retrieve the orig_tt_list_entry belonging to orig_node from the
* batadv_tt_global_entry list
*
- * returns it with an increased refcounter, NULL if not found
+ * Return: it with an increased refcounter, NULL if not found
*/
static struct batadv_tt_orig_list_entry *
batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
@@ -1257,7 +1314,7 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
hlist_for_each_entry_rcu(tmp_orig_entry, head, list) {
if (tmp_orig_entry->orig_node != orig_node)
continue;
- if (!atomic_inc_not_zero(&tmp_orig_entry->refcount))
+ if (!kref_get_unless_zero(&tmp_orig_entry->refcount))
continue;
orig_entry = tmp_orig_entry;
@@ -1268,8 +1325,15 @@ batadv_tt_global_orig_entry_find(const struct batadv_tt_global_entry *entry,
return orig_entry;
}
-/* find out if an orig_node is already in the list of a tt_global_entry.
- * returns true if found, false otherwise
+/**
+ * batadv_tt_global_entry_has_orig - check if a TT global entry is also handled
+ * by a given originator
+ * @entry: the TT global entry to check
+ * @orig_node: the originator to search in the list
+ *
+ * find out if an orig_node is already in the list of a tt_global_entry.
+ *
+ * Return: true if found, false otherwise
*/
static bool
batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry,
@@ -1281,7 +1345,7 @@ batadv_tt_global_entry_has_orig(const struct batadv_tt_global_entry *entry,
orig_entry = batadv_tt_global_orig_entry_find(entry, orig_node);
if (orig_entry) {
found = true;
- batadv_tt_orig_list_entry_free_ref(orig_entry);
+ batadv_tt_orig_list_entry_put(orig_entry);
}
return found;
@@ -1307,11 +1371,12 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
goto out;
INIT_HLIST_NODE(&orig_entry->list);
- atomic_inc(&orig_node->refcount);
+ kref_get(&orig_node->refcount);
batadv_tt_global_size_inc(orig_node, tt_global->common.vid);
orig_entry->orig_node = orig_node;
orig_entry->ttvn = ttvn;
- atomic_set(&orig_entry->refcount, 2);
+ kref_init(&orig_entry->refcount);
+ kref_get(&orig_entry->refcount);
spin_lock_bh(&tt_global->list_lock);
hlist_add_head_rcu(&orig_entry->list,
@@ -1321,7 +1386,7 @@ batadv_tt_global_orig_entry_add(struct batadv_tt_global_entry *tt_global,
out:
if (orig_entry)
- batadv_tt_orig_list_entry_free_ref(orig_entry);
+ batadv_tt_orig_list_entry_put(orig_entry);
}
/**
@@ -1341,7 +1406,7 @@ out:
*
* The caller must hold orig_node refcount.
*
- * Return true if the new entry has been added, false otherwise
+ * Return: true if the new entry has been added, false otherwise
*/
static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
@@ -1387,7 +1452,8 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
*/
if (flags & BATADV_TT_CLIENT_ROAM)
tt_global_entry->roam_at = jiffies;
- atomic_set(&common->refcount, 2);
+ kref_init(&common->refcount);
+ kref_get(&common->refcount);
common->added_at = jiffies;
INIT_HLIST_HEAD(&tt_global_entry->orig_list);
@@ -1401,7 +1467,7 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
if (unlikely(hash_added != 0)) {
/* remove the reference for the hash */
- batadv_tt_global_entry_free_ref(tt_global_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
goto out_remove;
}
} else {
@@ -1427,15 +1493,21 @@ static bool batadv_tt_global_add(struct batadv_priv *bat_priv,
}
/* if the client was temporary added before receiving the first
- * OGM announcing it, we have to clear the TEMP flag
+ * OGM announcing it, we have to clear the TEMP flag. Also,
+ * remove the previous temporary orig node and re-add it
+ * if required. If the orig entry changed, the new one which
+ * is a non-temporary entry is preferred.
*/
- common->flags &= ~BATADV_TT_CLIENT_TEMP;
+ if (common->flags & BATADV_TT_CLIENT_TEMP) {
+ batadv_tt_global_del_orig_list(tt_global_entry);
+ common->flags &= ~BATADV_TT_CLIENT_TEMP;
+ }
/* the change can carry possible "attribute" flags like the
* TT_CLIENT_WIFI, therefore they have to be copied in the
* client entry
*/
- tt_global_entry->common.flags |= flags;
+ common->flags |= flags;
/* If there is the BATADV_TT_CLIENT_ROAM flag set, there is only
* one originator left in the list and we previously received a
@@ -1481,9 +1553,9 @@ out_remove:
out:
if (tt_global_entry)
- batadv_tt_global_entry_free_ref(tt_global_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
if (tt_local_entry)
- batadv_tt_local_entry_free_ref(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return ret;
}
@@ -1493,7 +1565,7 @@ out:
* @tt_global_entry: global translation table entry to be analyzed
*
* This functon assumes the caller holds rcu_read_lock().
- * Returns best originator list entry or NULL on errors.
+ * Return: best originator list entry or NULL on errors.
*/
static struct batadv_tt_orig_list_entry *
batadv_transtable_best_orig(struct batadv_priv *bat_priv,
@@ -1514,20 +1586,20 @@ batadv_transtable_best_orig(struct batadv_priv *bat_priv,
if (best_router &&
bao->bat_neigh_cmp(router, BATADV_IF_DEFAULT,
best_router, BATADV_IF_DEFAULT) <= 0) {
- batadv_neigh_node_free_ref(router);
+ batadv_neigh_node_put(router);
continue;
}
/* release the refcount for the "old" best */
if (best_router)
- batadv_neigh_node_free_ref(best_router);
+ batadv_neigh_node_put(best_router);
best_entry = orig_entry;
best_router = router;
}
if (best_router)
- batadv_neigh_node_free_ref(best_router);
+ batadv_neigh_node_put(best_router);
return best_entry;
}
@@ -1580,7 +1652,7 @@ batadv_tt_global_print_entry(struct batadv_priv *bat_priv,
((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
- batadv_orig_node_vlan_free_ref(vlan);
+ batadv_orig_node_vlan_put(vlan);
}
print_list:
@@ -1612,7 +1684,7 @@ print_list:
((flags & BATADV_TT_CLIENT_ISOLA) ? 'I' : '.'),
((flags & BATADV_TT_CLIENT_TEMP) ? 'T' : '.'));
- batadv_orig_node_vlan_free_ref(vlan);
+ batadv_orig_node_vlan_put(vlan);
}
}
@@ -1653,7 +1725,7 @@ int batadv_tt_global_seq_print_text(struct seq_file *seq, void *offset)
}
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
return 0;
}
@@ -1681,7 +1753,7 @@ _batadv_tt_global_del_orig_entry(struct batadv_tt_global_entry *tt_global_entry,
* being part of a list
*/
hlist_del_rcu(&orig_entry->list);
- batadv_tt_orig_list_entry_free_ref(orig_entry);
+ batadv_tt_orig_list_entry_put(orig_entry);
}
/* deletes the orig list of a tt_global_entry */
@@ -1837,9 +1909,9 @@ static void batadv_tt_global_del(struct batadv_priv *bat_priv,
out:
if (tt_global_entry)
- batadv_tt_global_entry_free_ref(tt_global_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
if (local_entry)
- batadv_tt_local_entry_free_ref(local_entry);
+ batadv_tt_local_entry_put(local_entry);
}
/**
@@ -1893,7 +1965,7 @@ void batadv_tt_global_del_orig(struct batadv_priv *bat_priv,
tt_global->common.addr,
BATADV_PRINT_VID(vid), message);
hlist_del_rcu(&tt_common_entry->hash_entry);
- batadv_tt_global_entry_free_ref(tt_global);
+ batadv_tt_global_entry_put(tt_global);
}
}
spin_unlock_bh(list_lock);
@@ -1956,7 +2028,7 @@ static void batadv_tt_global_purge(struct batadv_priv *bat_priv)
hlist_del_rcu(&tt_common->hash_entry);
- batadv_tt_global_entry_free_ref(tt_global);
+ batadv_tt_global_entry_put(tt_global);
}
spin_unlock_bh(list_lock);
}
@@ -1988,7 +2060,7 @@ static void batadv_tt_global_table_free(struct batadv_priv *bat_priv)
tt_global = container_of(tt_common_entry,
struct batadv_tt_global_entry,
common);
- batadv_tt_global_entry_free_ref(tt_global);
+ batadv_tt_global_entry_put(tt_global);
}
spin_unlock_bh(list_lock);
}
@@ -2023,7 +2095,7 @@ _batadv_is_ap_isolated(struct batadv_tt_local_entry *tt_local_entry,
* @addr: mac address of the destination client
* @vid: VLAN identifier
*
- * Returns a pointer to the originator that was selected as destination in the
+ * Return: a pointer to the originator that was selected as destination in the
* mesh for contacting the client 'addr', NULL otherwise.
* In case of multiple originators serving the same client, the function returns
* the best one (best in terms of metric towards the destination node).
@@ -2063,15 +2135,15 @@ struct batadv_orig_node *batadv_transtable_search(struct batadv_priv *bat_priv,
/* found anything? */
if (best_entry)
orig_node = best_entry->orig_node;
- if (orig_node && !atomic_inc_not_zero(&orig_node->refcount))
+ if (orig_node && !kref_get_unless_zero(&orig_node->refcount))
orig_node = NULL;
rcu_read_unlock();
out:
if (tt_global_entry)
- batadv_tt_global_entry_free_ref(tt_global_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
if (tt_local_entry)
- batadv_tt_local_entry_free_ref(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return orig_node;
}
@@ -2098,7 +2170,7 @@ out:
* because the XOR operation can combine them all while trying to reduce the
* noise as much as possible.
*
- * Returns the checksum of the global table of a given originator.
+ * Return: the checksum of the global table of a given originator.
*/
static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
@@ -2175,7 +2247,7 @@ static u32 batadv_tt_global_crc(struct batadv_priv *bat_priv,
* For details about the computation, please refer to the documentation for
* batadv_tt_global_crc().
*
- * Returns the checksum of the local table
+ * Return: the checksum of the local table
*/
static u32 batadv_tt_local_crc(struct batadv_priv *bat_priv,
unsigned short vid)
@@ -2281,7 +2353,7 @@ static void batadv_tt_req_purge(struct batadv_priv *bat_priv)
* @bat_priv: the bat priv with all the soft interface information
* @orig_node: orig node this request is being issued for
*
- * Returns the pointer to the new tt_req_node struct if no request
+ * Return: the pointer to the new tt_req_node struct if no request
* has already been issued for this orig_node, NULL otherwise.
*/
static struct batadv_tt_req_node *
@@ -2316,7 +2388,7 @@ unlock:
* @entry_ptr: to be checked local tt entry
* @data_ptr: not used but definition required to satisfy the callback prototype
*
- * Returns 1 if the entry is a valid, 0 otherwise.
+ * Return: 1 if the entry is a valid, 0 otherwise.
*/
static int batadv_tt_local_valid(const void *entry_ptr, const void *data_ptr)
{
@@ -2400,9 +2472,8 @@ static void batadv_tt_tvlv_generate(struct batadv_priv *bat_priv,
* @orig_node: originator for which the CRCs have to be checked
* @tt_vlan: pointer to the first tvlv VLAN entry
* @num_vlan: number of tvlv VLAN entries
- * @create: if true, create VLAN objects if not found
*
- * Return true if all the received CRCs match the locally stored ones, false
+ * Return: true if all the received CRCs match the locally stored ones, false
* otherwise
*/
static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
@@ -2411,8 +2482,8 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
{
struct batadv_tvlv_tt_vlan_data *tt_vlan_tmp;
struct batadv_orig_node_vlan *vlan;
+ int i, orig_num_vlan;
u32 crc;
- int i;
/* check if each received CRC matches the locally stored one */
for (i = 0; i < num_vlan; i++) {
@@ -2432,12 +2503,24 @@ static bool batadv_tt_global_check_crc(struct batadv_orig_node *orig_node,
return false;
crc = vlan->tt.crc;
- batadv_orig_node_vlan_free_ref(vlan);
+ batadv_orig_node_vlan_put(vlan);
if (crc != ntohl(tt_vlan_tmp->crc))
return false;
}
+ /* check if any excess VLANs exist locally for the originator
+ * which are not mentioned in the TVLV from the originator.
+ */
+ rcu_read_lock();
+ orig_num_vlan = 0;
+ hlist_for_each_entry_rcu(vlan, &orig_node->vlan_list, list)
+ orig_num_vlan++;
+ rcu_read_unlock();
+
+ if (orig_num_vlan > num_vlan)
+ return false;
+
return true;
}
@@ -2493,6 +2576,8 @@ static void batadv_tt_global_update_crc(struct batadv_priv *bat_priv,
* @num_vlan: number of tvlv VLAN entries
* @full_table: ask for the entire translation table if true, while only for the
* last TT diff otherwise
+ *
+ * Return: true if the TT Request was sent, false otherwise
*/
static int batadv_send_tt_request(struct batadv_priv *bat_priv,
struct batadv_orig_node *dst_orig_node,
@@ -2553,7 +2638,7 @@ static int batadv_send_tt_request(struct batadv_priv *bat_priv,
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
if (ret && tt_req_node) {
spin_lock_bh(&bat_priv->tt.req_list_lock);
/* hlist_del_init() verifies tt_req_node still is in the list */
@@ -2573,7 +2658,7 @@ out:
* @req_src: mac address of tt request sender
* @req_dst: mac address of tt request recipient
*
- * Returns true if tt request reply was sent, false otherwise.
+ * Return: true if tt request reply was sent, false otherwise.
*/
static bool batadv_send_other_tt_response(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data *tt_data,
@@ -2691,9 +2776,9 @@ unlock:
out:
if (res_dst_orig_node)
- batadv_orig_node_free_ref(res_dst_orig_node);
+ batadv_orig_node_put(res_dst_orig_node);
if (req_dst_orig_node)
- batadv_orig_node_free_ref(req_dst_orig_node);
+ batadv_orig_node_put(req_dst_orig_node);
kfree(tvlv_tt_data);
return ret;
}
@@ -2705,7 +2790,7 @@ out:
* @tt_data: tt data containing the tt request information
* @req_src: mac address of tt request sender
*
- * Returns true if tt request reply was sent, false otherwise.
+ * Return: true if tt request reply was sent, false otherwise.
*/
static bool batadv_send_my_tt_response(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data *tt_data,
@@ -2808,9 +2893,9 @@ unlock:
out:
spin_unlock_bh(&bat_priv->tt.commit_lock);
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
kfree(tvlv_tt_data);
/* The packet was for this host, so it doesn't need to be re-routed */
return true;
@@ -2823,7 +2908,7 @@ out:
* @req_src: mac address of tt request sender
* @req_dst: mac address of tt request recipient
*
- * Returns true if tt request reply was sent, false otherwise.
+ * Return: true if tt request reply was sent, false otherwise.
*/
static bool batadv_send_tt_response(struct batadv_priv *bat_priv,
struct batadv_tvlv_tt_data *tt_data,
@@ -2896,7 +2981,7 @@ static void batadv_tt_fill_gtable(struct batadv_priv *bat_priv,
out:
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
}
static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
@@ -2918,7 +3003,7 @@ static void batadv_tt_update_changes(struct batadv_priv *bat_priv,
* @addr: the mac address of the client to check
* @vid: VLAN identifier
*
- * Returns true if the client is served by this node, false otherwise.
+ * Return: true if the client is served by this node, false otherwise.
*/
bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
unsigned short vid)
@@ -2938,7 +3023,7 @@ bool batadv_is_my_client(struct batadv_priv *bat_priv, const u8 *addr,
ret = true;
out:
if (tt_local_entry)
- batadv_tt_local_entry_free_ref(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return ret;
}
@@ -3002,7 +3087,7 @@ static void batadv_handle_tt_response(struct batadv_priv *bat_priv,
spin_unlock_bh(&bat_priv->tt.req_list_lock);
out:
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
}
static void batadv_tt_roam_list_free(struct batadv_priv *bat_priv)
@@ -3035,11 +3120,16 @@ static void batadv_tt_roam_purge(struct batadv_priv *bat_priv)
spin_unlock_bh(&bat_priv->tt.roam_list_lock);
}
-/* This function checks whether the client already reached the
+/**
+ * batadv_tt_check_roam_count - check if a client has roamed too frequently
+ * @bat_priv: the bat priv with all the soft interface information
+ * @client: mac address of the roaming client
+ *
+ * This function checks whether the client already reached the
* maximum number of possible roaming phases. In this case the ROAMING_ADV
* will not be sent.
*
- * returns true if the ROAMING_ADV can be sent, false otherwise
+ * Return: true if the ROAMING_ADV can be sent, false otherwise
*/
static bool batadv_tt_check_roam_count(struct batadv_priv *bat_priv, u8 *client)
{
@@ -3128,7 +3218,7 @@ static void batadv_send_roam_adv(struct batadv_priv *bat_priv, u8 *client,
out:
if (primary_if)
- batadv_hardif_free_ref(primary_if);
+ batadv_hardif_put(primary_if);
}
static void batadv_tt_purge(struct work_struct *work)
@@ -3252,11 +3342,11 @@ static void batadv_tt_local_purge_pending_clients(struct batadv_priv *bat_priv)
/* decrease the reference held for this vlan */
vlan = batadv_softif_vlan_get(bat_priv, tt_common->vid);
if (vlan) {
- batadv_softif_vlan_free_ref(vlan);
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
+ batadv_softif_vlan_put(vlan);
}
- batadv_tt_local_entry_free_ref(tt_local);
+ batadv_tt_local_entry_put(tt_local);
}
spin_unlock_bh(list_lock);
}
@@ -3319,7 +3409,10 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
bool ret = false;
vlan = batadv_softif_vlan_get(bat_priv, vid);
- if (!vlan || !atomic_read(&vlan->ap_isolation))
+ if (!vlan)
+ return false;
+
+ if (!atomic_read(&vlan->ap_isolation))
goto out;
tt_local_entry = batadv_tt_local_hash_find(bat_priv, dst, vid);
@@ -3336,12 +3429,11 @@ bool batadv_is_ap_isolated(struct batadv_priv *bat_priv, u8 *src, u8 *dst,
ret = true;
out:
- if (vlan)
- batadv_softif_vlan_free_ref(vlan);
+ batadv_softif_vlan_put(vlan);
if (tt_global_entry)
- batadv_tt_global_entry_free_ref(tt_global_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
if (tt_local_entry)
- batadv_tt_local_entry_free_ref(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
return ret;
}
@@ -3349,13 +3441,12 @@ out:
* batadv_tt_update_orig - update global translation table with new tt
* information received via ogms
* @bat_priv: the bat priv with all the soft interface information
- * @orig: the orig_node of the ogm
- * @tt_vlan: pointer to the first tvlv VLAN entry
+ * @orig_node: the orig_node of the ogm
+ * @tt_buff: pointer to the first tvlv VLAN entry
* @tt_num_vlan: number of tvlv VLAN entries
* @tt_change: pointer to the first entry in the TT buffer
* @tt_num_changes: number of tt changes inside the tt buffer
* @ttvn: translation table version number of this changeset
- * @tt_crc: crc32 checksum of orig node's translation table
*/
static void batadv_tt_update_orig(struct batadv_priv *bat_priv,
struct batadv_orig_node *orig_node,
@@ -3437,7 +3528,7 @@ request_table:
* @addr: the mac address of the client to check
* @vid: VLAN identifier
*
- * Returns true if we know that the client has moved from its old originator
+ * Return: true if we know that the client has moved from its old originator
* to another one. This entry is still kept for consistency purposes and will be
* deleted later by a DEL or because of timeout
*/
@@ -3452,7 +3543,7 @@ bool batadv_tt_global_client_is_roaming(struct batadv_priv *bat_priv,
goto out;
ret = tt_global_entry->common.flags & BATADV_TT_CLIENT_ROAM;
- batadv_tt_global_entry_free_ref(tt_global_entry);
+ batadv_tt_global_entry_put(tt_global_entry);
out:
return ret;
}
@@ -3463,7 +3554,7 @@ out:
* @addr: the mac address of the local client to query
* @vid: VLAN identifier
*
- * Returns true if the local client is known to be roaming (it is not served by
+ * Return: true if the local client is known to be roaming (it is not served by
* this node anymore) or not. If yes, the client is still present in the table
* to keep the latter consistent with the node TTVN
*/
@@ -3478,7 +3569,7 @@ bool batadv_tt_local_client_is_roaming(struct batadv_priv *bat_priv,
goto out;
ret = tt_local_entry->common.flags & BATADV_TT_CLIENT_ROAM;
- batadv_tt_local_entry_free_ref(tt_local_entry);
+ batadv_tt_local_entry_put(tt_local_entry);
out:
return ret;
}
@@ -3592,7 +3683,7 @@ static void batadv_tt_tvlv_ogm_handler_v1(struct batadv_priv *bat_priv,
* @tvlv_value: tvlv buffer containing the tt data
* @tvlv_value_len: tvlv buffer length
*
- * Returns NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS
+ * Return: NET_RX_DROP if the tt tvlv is to be re-routed, NET_RX_SUCCESS
* otherwise.
*/
static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
@@ -3673,7 +3764,7 @@ static int batadv_tt_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
* @tvlv_value: tvlv buffer containing the tt data
* @tvlv_value_len: tvlv buffer length
*
- * Returns NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS
+ * Return: NET_RX_DROP if the tt roam tvlv is to be re-routed, NET_RX_SUCCESS
* otherwise.
*/
static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
@@ -3711,7 +3802,7 @@ static int batadv_roam_tvlv_unicast_handler_v1(struct batadv_priv *bat_priv,
out:
if (orig_node)
- batadv_orig_node_free_ref(orig_node);
+ batadv_orig_node_put(orig_node);
return NET_RX_SUCCESS;
}
@@ -3719,7 +3810,7 @@ out:
* batadv_tt_init - initialise the translation table internals
* @bat_priv: the bat priv with all the soft interface information
*
- * Return 0 on success or negative error number in case of failure.
+ * Return: 0 on success or negative error number in case of failure.
*/
int batadv_tt_init(struct batadv_priv *bat_priv)
{
@@ -3757,7 +3848,7 @@ int batadv_tt_init(struct batadv_priv *bat_priv)
* @addr: the mac address of the client
* @vid: the identifier of the VLAN where this client is connected
*
- * Returns true if the client is marked with the TT_CLIENT_ISOLA flag, false
+ * Return: true if the client is marked with the TT_CLIENT_ISOLA flag, false
* otherwise
*/
bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
@@ -3772,7 +3863,7 @@ bool batadv_tt_global_is_isolated(struct batadv_priv *bat_priv,
ret = tt->common.flags & BATADV_TT_CLIENT_ISOLA;
- batadv_tt_global_entry_free_ref(tt);
+ batadv_tt_global_entry_put(tt);
return ret;
}
diff --git a/net/batman-adv/translation-table.h b/net/batman-adv/translation-table.h
index abd8e116e5fb..7c7e2c006bfe 100644
--- a/net/batman-adv/translation-table.h
+++ b/net/batman-adv/translation-table.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich, Antonio Quartulli
*
diff --git a/net/batman-adv/types.h b/net/batman-adv/types.h
index d260efd70499..9abfb3e73c34 100644
--- a/net/batman-adv/types.h
+++ b/net/batman-adv/types.h
@@ -1,4 +1,4 @@
-/* Copyright (C) 2007-2015 B.A.T.M.A.N. contributors:
+/* Copyright (C) 2007-2016 B.A.T.M.A.N. contributors:
*
* Marek Lindner, Simon Wunderlich
*
@@ -22,9 +22,11 @@
#error only "main.h" can be included directly
#endif
+#include <linux/average.h>
#include <linux/bitops.h>
#include <linux/compiler.h>
#include <linux/if_ether.h>
+#include <linux/kref.h>
#include <linux/netdevice.h>
#include <linux/sched.h> /* for linux/wait.h */
#include <linux/spinlock.h>
@@ -73,7 +75,7 @@ enum batadv_dhcp_recipient {
#define BATADV_TT_SYNC_MASK 0x00F0
/**
- * struct batadv_hard_iface_bat_iv - per hard interface B.A.T.M.A.N. IV data
+ * struct batadv_hard_iface_bat_iv - per hard-interface B.A.T.M.A.N. IV data
* @ogm_buff: buffer holding the OGM packet
* @ogm_buff_len: length of the OGM packet buffer
* @ogm_seqno: OGM sequence number - used to identify each OGM
@@ -85,6 +87,36 @@ struct batadv_hard_iface_bat_iv {
};
/**
+ * enum batadv_v_hard_iface_flags - interface flags useful to B.A.T.M.A.N. V
+ * @BATADV_FULL_DUPLEX: tells if the connection over this link is full-duplex
+ * @BATADV_WARNING_DEFAULT: tells whether we have warned the user that no
+ * throughput data is available for this interface and that default values are
+ * assumed.
+ */
+enum batadv_v_hard_iface_flags {
+ BATADV_FULL_DUPLEX = BIT(0),
+ BATADV_WARNING_DEFAULT = BIT(1),
+};
+
+/**
+ * struct batadv_hard_iface_bat_v - per hard-interface B.A.T.M.A.N. V data
+ * @elp_interval: time interval between two ELP transmissions
+ * @elp_seqno: current ELP sequence number
+ * @elp_skb: base skb containing the ELP message to send
+ * @elp_wq: workqueue used to schedule ELP transmissions
+ * @throughput_override: throughput override to disable link auto-detection
+ * @flags: interface specific flags
+ */
+struct batadv_hard_iface_bat_v {
+ atomic_t elp_interval;
+ atomic_t elp_seqno;
+ struct sk_buff *elp_skb;
+ struct delayed_work elp_wq;
+ atomic_t throughput_override;
+ u8 flags;
+};
+
+/**
* struct batadv_hard_iface - network device known to batman-adv
* @list: list node for batadv_hardif_list
* @if_num: identificator of the interface
@@ -97,9 +129,12 @@ struct batadv_hard_iface_bat_iv {
* batman-adv for this interface
* @soft_iface: the batman-adv interface which uses this network interface
* @rcu: struct used for freeing in an RCU-safe manner
- * @bat_iv: BATMAN IV specific per hard interface data
- * @cleanup_work: work queue callback item for hard interface deinit
+ * @bat_iv: per hard-interface B.A.T.M.A.N. IV data
+ * @bat_v: per hard-interface B.A.T.M.A.N. V data
+ * @cleanup_work: work queue callback item for hard-interface deinit
* @debug_dir: dentry for nc subdir in batman-adv directory in debugfs
+ * @neigh_list: list of unique single hop neighbors via this interface
+ * @neigh_list_lock: lock protecting neigh_list
*/
struct batadv_hard_iface {
struct list_head list;
@@ -108,22 +143,29 @@ struct batadv_hard_iface {
struct net_device *net_dev;
u8 num_bcasts;
struct kobject *hardif_obj;
- atomic_t refcount;
+ struct kref refcount;
struct packet_type batman_adv_ptype;
struct net_device *soft_iface;
struct rcu_head rcu;
struct batadv_hard_iface_bat_iv bat_iv;
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ struct batadv_hard_iface_bat_v bat_v;
+#endif
struct work_struct cleanup_work;
struct dentry *debug_dir;
+ struct hlist_head neigh_list;
+ /* neigh_list_lock protects: neigh_list */
+ spinlock_t neigh_list_lock;
};
/**
* struct batadv_orig_ifinfo - originator info per outgoing interface
* @list: list node for orig_node::ifinfo_list
- * @if_outgoing: pointer to outgoing hard interface
+ * @if_outgoing: pointer to outgoing hard-interface
* @router: router that should be used to reach this originator
* @last_real_seqno: last and best known sequence number
* @last_ttl: ttl of last received packet
+ * @last_seqno_forwarded: seqno of the OGM which was forwarded last
* @batman_seqno_reset: time when the batman seqno window was reset
* @refcount: number of contexts the object is used
* @rcu: struct used for freeing in an RCU-safe manner
@@ -134,8 +176,9 @@ struct batadv_orig_ifinfo {
struct batadv_neigh_node __rcu *router; /* rcu protected pointer */
u32 last_real_seqno;
u8 last_ttl;
+ u32 last_seqno_forwarded;
unsigned long batman_seqno_reset;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
@@ -191,13 +234,13 @@ struct batadv_orig_node_vlan {
unsigned short vid;
struct batadv_vlan_tt tt;
struct hlist_node list;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
/**
* struct batadv_orig_bat_iv - B.A.T.M.A.N. IV private orig_node members
- * @bcast_own: set of bitfields (one per hard interface) where each one counts
+ * @bcast_own: set of bitfields (one per hard-interface) where each one counts
* the number of our OGMs this orig_node rebroadcasted "back" to us (relative
* to last_real_seqno). Every bitfield is BATADV_TQ_LOCAL_WINDOW_SIZE bits long.
* @bcast_own_sum: sum of bcast_own
@@ -218,12 +261,12 @@ struct batadv_orig_bat_iv {
* @orig: originator ethernet address
* @ifinfo_list: list for routers per outgoing interface
* @last_bonding_candidate: pointer to last ifinfo of last used router
- * @batadv_dat_addr_t: address of the orig node in the distributed hash
+ * @dat_addr: address of the orig node in the distributed hash
* @last_seen: time when last packet from this node was received
* @bcast_seqno_reset: time when the broadcast seqno window was reset
* @mcast_handler_lock: synchronizes mcast-capability and -flag changes
* @mcast_flags: multicast flags announced by the orig node
- * @mcast_want_all_unsnoop_node: a list node for the
+ * @mcast_want_all_unsnoopables_node: a list node for the
* mcast.want_all_unsnoopables list
* @mcast_want_all_ipv4_node: a list node for the mcast.want_all_ipv4 list
* @mcast_want_all_ipv6_node: a list node for the mcast.want_all_ipv6 list
@@ -293,7 +336,7 @@ struct batadv_orig_node {
struct batadv_priv *bat_priv;
/* bcast_seqno_lock protects: bcast_bits & last_bcast_seqno */
spinlock_t bcast_seqno_lock;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
#ifdef CONFIG_BATMAN_ADV_NC
struct list_head in_coding_list;
@@ -336,7 +379,48 @@ struct batadv_gw_node {
struct batadv_orig_node *orig_node;
u32 bandwidth_down;
u32 bandwidth_up;
- atomic_t refcount;
+ struct kref refcount;
+ struct rcu_head rcu;
+};
+
+DECLARE_EWMA(throughput, 1024, 8)
+
+/**
+ * struct batadv_hardif_neigh_node_bat_v - B.A.T.M.A.N. V private neighbor
+ * information
+ * @throughput: ewma link throughput towards this neighbor
+ * @elp_interval: time interval between two ELP transmissions
+ * @elp_latest_seqno: latest and best known ELP sequence number
+ * @last_unicast_tx: when the last unicast packet has been sent to this neighbor
+ * @metric_work: work queue callback item for metric update
+ */
+struct batadv_hardif_neigh_node_bat_v {
+ struct ewma_throughput throughput;
+ u32 elp_interval;
+ u32 elp_latest_seqno;
+ unsigned long last_unicast_tx;
+ struct work_struct metric_work;
+};
+
+/**
+ * struct batadv_hardif_neigh_node - unique neighbor per hard-interface
+ * @list: list node for batadv_hard_iface::neigh_list
+ * @addr: the MAC address of the neighboring interface
+ * @if_incoming: pointer to incoming hard-interface
+ * @last_seen: when last packet via this neighbor was received
+ * @bat_v: B.A.T.M.A.N. V private data
+ * @refcount: number of contexts the object is used
+ * @rcu: struct used for freeing in a RCU-safe manner
+ */
+struct batadv_hardif_neigh_node {
+ struct hlist_node list;
+ u8 addr[ETH_ALEN];
+ struct batadv_hard_iface *if_incoming;
+ unsigned long last_seen;
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ struct batadv_hardif_neigh_node_bat_v bat_v;
+#endif
+ struct kref refcount;
struct rcu_head rcu;
};
@@ -347,11 +431,10 @@ struct batadv_gw_node {
* @addr: the MAC address of the neighboring interface
* @ifinfo_list: list for routing metrics per outgoing interface
* @ifinfo_lock: lock protecting private ifinfo members and list
- * @if_incoming: pointer to incoming hard interface
+ * @if_incoming: pointer to incoming hard-interface
* @last_seen: when last packet via this neighbor was received
- * @last_ttl: last received ttl from this neigh node
+ * @refcount: number of contexts the object is used
* @rcu: struct used for freeing in an RCU-safe manner
- * @bat_iv: B.A.T.M.A.N. IV private structure
*/
struct batadv_neigh_node {
struct hlist_node list;
@@ -361,13 +444,13 @@ struct batadv_neigh_node {
spinlock_t ifinfo_lock; /* protects ifinfo_list and its members */
struct batadv_hard_iface *if_incoming;
unsigned long last_seen;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
/**
* struct batadv_neigh_ifinfo_bat_iv - neighbor information per outgoing
- * interface for BATMAN IV
+ * interface for B.A.T.M.A.N. IV
* @tq_recv: ring buffer of received TQ values from this neigh node
* @tq_index: ring buffer index
* @tq_avg: averaged tq of all tq values in the ring buffer (tq_recv)
@@ -384,10 +467,22 @@ struct batadv_neigh_ifinfo_bat_iv {
};
/**
+ * struct batadv_neigh_ifinfo_bat_v - neighbor information per outgoing
+ * interface for B.A.T.M.A.N. V
+ * @throughput: last throughput metric received from originator via this neigh
+ * @last_seqno: last sequence number known for this neighbor
+ */
+struct batadv_neigh_ifinfo_bat_v {
+ u32 throughput;
+ u32 last_seqno;
+};
+
+/**
* struct batadv_neigh_ifinfo - neighbor information per outgoing interface
* @list: list node for batadv_neigh_node::ifinfo_list
- * @if_outgoing: pointer to outgoing hard interface
+ * @if_outgoing: pointer to outgoing hard-interface
* @bat_iv: B.A.T.M.A.N. IV private structure
+ * @bat_v: B.A.T.M.A.N. V private data
* @last_ttl: last received ttl from this neigh node
* @refcount: number of contexts the object is used
* @rcu: struct used for freeing in a RCU-safe manner
@@ -396,18 +491,22 @@ struct batadv_neigh_ifinfo {
struct hlist_node list;
struct batadv_hard_iface *if_outgoing;
struct batadv_neigh_ifinfo_bat_iv bat_iv;
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ struct batadv_neigh_ifinfo_bat_v bat_v;
+#endif
u8 last_ttl;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
+#ifdef CONFIG_BATMAN_ADV_BLA
+
/**
* struct batadv_bcast_duplist_entry - structure for LAN broadcast suppression
- * @orig[ETH_ALEN]: mac address of orig node orginating the broadcast
+ * @orig: mac address of orig node orginating the broadcast
* @crc: crc32 checksum of broadcast payload
* @entrytime: time when the broadcast packet was received
*/
-#ifdef CONFIG_BATMAN_ADV_BLA
struct batadv_bcast_duplist_entry {
u8 orig[ETH_ALEN];
__be32 crc;
@@ -549,9 +648,11 @@ struct batadv_priv_tt {
struct delayed_work work;
};
+#ifdef CONFIG_BATMAN_ADV_BLA
+
/**
* struct batadv_priv_bla - per mesh interface bridge loope avoidance data
- * @num_requests; number of bla requests in flight
+ * @num_requests: number of bla requests in flight
* @claim_hash: hash table containing mesh nodes this host has claimed
* @backbone_hash: hash table containing all detected backbone gateways
* @bcast_duplist: recently received broadcast packets array (for broadcast
@@ -561,7 +662,6 @@ struct batadv_priv_tt {
* @claim_dest: local claim data (e.g. claim group)
* @work: work queue callback item for cleanups & bla announcements
*/
-#ifdef CONFIG_BATMAN_ADV_BLA
struct batadv_priv_bla {
atomic_t num_requests;
struct batadv_hashtable *claim_hash;
@@ -575,6 +675,8 @@ struct batadv_priv_bla {
};
#endif
+#ifdef CONFIG_BATMAN_ADV_DEBUG
+
/**
* struct batadv_priv_debug_log - debug logging data
* @log_buff: buffer holding the logs (ring bufer)
@@ -583,7 +685,6 @@ struct batadv_priv_bla {
* @lock: lock protecting log_buff, log_start & log_end
* @queue_wait: log reader's wait queue
*/
-#ifdef CONFIG_BATMAN_ADV_DEBUG
struct batadv_priv_debug_log {
char log_buff[BATADV_LOG_BUF_LEN];
unsigned long log_start;
@@ -625,13 +726,14 @@ struct batadv_priv_tvlv {
spinlock_t handler_list_lock; /* protects handler_list */
};
+#ifdef CONFIG_BATMAN_ADV_DAT
+
/**
* struct batadv_priv_dat - per mesh interface DAT private data
* @addr: node DAT address
* @hash: hashtable representing the local ARP cache
* @work: work queue callback item for cache purging
*/
-#ifdef CONFIG_BATMAN_ADV_DAT
struct batadv_priv_dat {
batadv_dat_addr_t addr;
struct batadv_hashtable *hash;
@@ -719,11 +821,25 @@ struct batadv_softif_vlan {
atomic_t ap_isolation; /* boolean */
struct batadv_vlan_tt tt;
struct hlist_node list;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
/**
+ * struct batadv_priv_bat_v - B.A.T.M.A.N. V per soft-interface private data
+ * @ogm_buff: buffer holding the OGM packet
+ * @ogm_buff_len: length of the OGM packet buffer
+ * @ogm_seqno: OGM sequence number - used to identify each OGM
+ * @ogm_wq: workqueue used to schedule OGM transmissions
+ */
+struct batadv_priv_bat_v {
+ unsigned char *ogm_buff;
+ int ogm_buff_len;
+ atomic_t ogm_seqno;
+ struct delayed_work ogm_wq;
+};
+
+/**
* struct batadv_priv - per mesh interface data
* @mesh_state: current status of the mesh (inactive/active/deactivating)
* @soft_iface: net device which holds this struct as private data
@@ -746,6 +862,9 @@ struct batadv_softif_vlan {
* @orig_interval: OGM broadcast interval in milliseconds
* @hop_penalty: penalty which will be applied to an OGM's tq-field on every hop
* @log_level: configured log level (see batadv_dbg_level)
+ * @isolation_mark: the skb->mark value used to match packets for AP isolation
+ * @isolation_mark_mask: bitmask identifying the bits in skb->mark to be used
+ * for the isolation mark
* @bcast_seqno: last sent broadcast packet sequence number
* @bcast_queue_left: number of remaining buffered broadcast packet slots
* @batman_queue_left: number of remaining OGM packet slots
@@ -758,8 +877,8 @@ struct batadv_softif_vlan {
* @forw_bat_list_lock: lock protecting forw_bat_list
* @forw_bcast_list_lock: lock protecting forw_bcast_list
* @orig_work: work queue callback item for orig node purging
- * @cleanup_work: work queue callback item for soft interface deinit
- * @primary_if: one of the hard interfaces assigned to this mesh interface
+ * @cleanup_work: work queue callback item for soft-interface deinit
+ * @primary_if: one of the hard-interfaces assigned to this mesh interface
* becomes the primary interface
* @bat_algo_ops: routing algorithm used by this mesh interface
* @softif_vlan_list: a list of softif_vlan structs, one per VLAN created on top
@@ -773,7 +892,8 @@ struct batadv_softif_vlan {
* @dat: distributed arp table data
* @mcast: multicast data
* @network_coding: bool indicating whether network coding is enabled
- * @batadv_priv_nc: network coding data
+ * @nc: network coding data
+ * @bat_v: B.A.T.M.A.N. V per soft-interface private data
*/
struct batadv_priv {
atomic_t mesh_state;
@@ -839,6 +959,9 @@ struct batadv_priv {
atomic_t network_coding;
struct batadv_priv_nc nc;
#endif /* CONFIG_BATMAN_ADV_NC */
+#ifdef CONFIG_BATMAN_ADV_BATMAN_V
+ struct batadv_priv_bat_v bat_v;
+#endif
};
/**
@@ -871,6 +994,8 @@ struct batadv_socket_packet {
u8 icmp_packet[BATADV_ICMP_MAX_PACKET_SIZE];
};
+#ifdef CONFIG_BATMAN_ADV_BLA
+
/**
* struct batadv_bla_backbone_gw - batman-adv gateway bridged into the LAN
* @orig: originator address of backbone node (mac address of primary iface)
@@ -884,10 +1009,10 @@ struct batadv_socket_packet {
* backbone gateway - no bcast traffic is formwared until the situation was
* resolved
* @crc: crc16 checksum over all claims
+ * @crc_lock: lock protecting crc
* @refcount: number of contexts the object is used
* @rcu: struct used for freeing in an RCU-safe manner
*/
-#ifdef CONFIG_BATMAN_ADV_BLA
struct batadv_bla_backbone_gw {
u8 orig[ETH_ALEN];
unsigned short vid;
@@ -897,7 +1022,8 @@ struct batadv_bla_backbone_gw {
atomic_t wait_periods;
atomic_t request_sent;
u16 crc;
- atomic_t refcount;
+ spinlock_t crc_lock; /* protects crc */
+ struct kref refcount;
struct rcu_head rcu;
};
@@ -905,7 +1031,7 @@ struct batadv_bla_backbone_gw {
* struct batadv_bla_claim - claimed non-mesh client structure
* @addr: mac address of claimed non-mesh client
* @vid: vlan id this client was detected on
- * @batadv_bla_backbone_gw: pointer to backbone gw claiming this client
+ * @backbone_gw: pointer to backbone gw claiming this client
* @lasttime: last time we heard of claim (locals only)
* @hash_entry: hlist node for batadv_priv_bla::claim_hash
* @refcount: number of contexts the object is used
@@ -918,7 +1044,7 @@ struct batadv_bla_claim {
unsigned long lasttime;
struct hlist_node hash_entry;
struct rcu_head rcu;
- atomic_t refcount;
+ struct kref refcount;
};
#endif
@@ -939,7 +1065,7 @@ struct batadv_tt_common_entry {
struct hlist_node hash_entry;
u16 flags;
unsigned long added_at;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
@@ -981,7 +1107,7 @@ struct batadv_tt_orig_list_entry {
struct batadv_orig_node *orig_node;
u8 ttvn;
struct hlist_node list;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
@@ -1034,7 +1160,7 @@ struct batadv_tt_roam_node {
struct batadv_nc_node {
struct list_head list;
u8 addr[ETH_ALEN];
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
struct batadv_orig_node *orig_node;
unsigned long last_seen;
@@ -1054,7 +1180,7 @@ struct batadv_nc_node {
struct batadv_nc_path {
struct hlist_node hash_entry;
struct rcu_head rcu;
- atomic_t refcount;
+ struct kref refcount;
struct list_head packet_list;
spinlock_t packet_list_lock; /* Protects packet_list */
u8 next_hop[ETH_ALEN];
@@ -1131,11 +1257,13 @@ struct batadv_forw_packet {
* @bat_primary_iface_set: called when primary interface is selected / changed
* @bat_ogm_schedule: prepare a new outgoing OGM for the send queue
* @bat_ogm_emit: send scheduled OGM
+ * @bat_hardif_neigh_init: called on creation of single hop entry
* @bat_neigh_cmp: compare the metrics of two neighbors for their respective
* outgoing interfaces
- * @bat_neigh_is_equiv_or_better: check if neigh1 is equally good or better
- * than neigh2 for their respective outgoing interface from the metric
+ * @bat_neigh_is_similar_or_better: check if neigh1 is equally similar or
+ * better than neigh2 for their respective outgoing interface from the metric
* prospective
+ * @bat_neigh_print: print the single hop neighbor list (optional)
* @bat_neigh_free: free the resources allocated by the routing algorithm for a
* neigh_node object
* @bat_orig_print: print the originator table (optional)
@@ -1156,15 +1284,17 @@ struct batadv_algo_ops {
void (*bat_ogm_schedule)(struct batadv_hard_iface *hard_iface);
void (*bat_ogm_emit)(struct batadv_forw_packet *forw_packet);
/* neigh_node handling API */
+ void (*bat_hardif_neigh_init)(struct batadv_hardif_neigh_node *neigh);
int (*bat_neigh_cmp)(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing1,
struct batadv_neigh_node *neigh2,
struct batadv_hard_iface *if_outgoing2);
- bool (*bat_neigh_is_equiv_or_better)
+ bool (*bat_neigh_is_similar_or_better)
(struct batadv_neigh_node *neigh1,
struct batadv_hard_iface *if_outgoing1,
struct batadv_neigh_node *neigh2,
struct batadv_hard_iface *if_outgoing2);
+ void (*bat_neigh_print)(struct batadv_priv *priv, struct seq_file *seq);
void (*bat_neigh_free)(struct batadv_neigh_node *neigh);
/* orig_node handling API */
void (*bat_orig_print)(struct batadv_priv *priv, struct seq_file *seq,
@@ -1193,7 +1323,7 @@ struct batadv_dat_entry {
unsigned short vid;
unsigned long last_update;
struct hlist_node hash_entry;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
@@ -1224,14 +1354,12 @@ struct batadv_dat_candidate {
* struct batadv_tvlv_container - container for tvlv appended to OGMs
* @list: hlist node for batadv_priv_tvlv::container_list
* @tvlv_hdr: tvlv header information needed to construct the tvlv
- * @value_len: length of the buffer following this struct which contains
- * the actual tvlv payload
* @refcount: number of contexts the object is used
*/
struct batadv_tvlv_container {
struct hlist_node list;
struct batadv_tvlv_hdr tvlv_hdr;
- atomic_t refcount;
+ struct kref refcount;
};
/**
@@ -1258,7 +1386,7 @@ struct batadv_tvlv_handler {
u8 type;
u8 version;
u8 flags;
- atomic_t refcount;
+ struct kref refcount;
struct rcu_head rcu;
};
diff --git a/net/bluetooth/6lowpan.c b/net/bluetooth/6lowpan.c
index 9e9cca3689a0..8a4cc2f7f0db 100644
--- a/net/bluetooth/6lowpan.c
+++ b/net/bluetooth/6lowpan.c
@@ -307,6 +307,9 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
/* check that it's our buffer */
if (lowpan_is_ipv6(*skb_network_header(skb))) {
+ /* Pull off the 1-byte of 6lowpan header. */
+ skb_pull(skb, 1);
+
/* Copy the packet so that the IPv6 header is
* properly aligned.
*/
@@ -317,6 +320,7 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
local_skb->protocol = htons(ETH_P_IPV6);
local_skb->pkt_type = PACKET_HOST;
+ local_skb->dev = dev;
skb_set_transport_header(local_skb, sizeof(struct ipv6hdr));
@@ -335,6 +339,8 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
if (!local_skb)
goto drop;
+ local_skb->dev = dev;
+
ret = iphc_decompress(local_skb, dev, chan);
if (ret < 0) {
kfree_skb(local_skb);
@@ -343,7 +349,6 @@ static int recv_pkt(struct sk_buff *skb, struct net_device *dev,
local_skb->protocol = htons(ETH_P_IPV6);
local_skb->pkt_type = PACKET_HOST;
- local_skb->dev = dev;
if (give_skb_to_upper(local_skb, dev)
!= NET_RX_SUCCESS) {
@@ -825,9 +830,7 @@ static int setup_netdev(struct l2cap_chan *chan, struct lowpan_dev **dev)
list_add_rcu(&(*dev)->list, &bt_6lowpan_devices);
spin_unlock(&devices_lock);
- lowpan_netdev_setup(netdev, LOWPAN_LLTYPE_BTLE);
-
- err = register_netdev(netdev);
+ err = lowpan_register_netdev(netdev, LOWPAN_LLTYPE_BTLE);
if (err < 0) {
BT_INFO("register_netdev failed %d", err);
spin_lock(&devices_lock);
@@ -890,7 +893,7 @@ static void delete_netdev(struct work_struct *work)
struct lowpan_dev *entry = container_of(work, struct lowpan_dev,
delete_netdev);
- unregister_netdev(entry->netdev);
+ lowpan_unregister_netdev(entry->netdev);
/* The entry pointer is deleted by the netdev destructor. */
}
@@ -1348,7 +1351,7 @@ static void disconnect_devices(void)
ifdown(entry->netdev);
BT_DBG("Unregistering netdev %s %p",
entry->netdev->name, entry->netdev);
- unregister_netdev(entry->netdev);
+ lowpan_unregister_netdev(entry->netdev);
kfree(entry);
}
}
diff --git a/net/bluetooth/Kconfig b/net/bluetooth/Kconfig
index 95d1a66ba03a..06c31b9a68b0 100644
--- a/net/bluetooth/Kconfig
+++ b/net/bluetooth/Kconfig
@@ -69,6 +69,15 @@ config BT_6LOWPAN
help
IPv6 compression over Bluetooth Low Energy.
+config BT_LEDS
+ bool "Enable LED triggers"
+ depends on BT
+ depends on LEDS_CLASS
+ select LEDS_TRIGGERS
+ help
+ This option selects a few LED triggers for different
+ Bluetooth events.
+
config BT_SELFTEST
bool "Bluetooth self testing support"
depends on BT && DEBUG_KERNEL
diff --git a/net/bluetooth/Makefile b/net/bluetooth/Makefile
index 2b15ae8c1def..b3ff12eb9b6d 100644
--- a/net/bluetooth/Makefile
+++ b/net/bluetooth/Makefile
@@ -17,6 +17,7 @@ bluetooth-y := af_bluetooth.o hci_core.o hci_conn.o hci_event.o mgmt.o \
bluetooth-$(CONFIG_BT_BREDR) += sco.o
bluetooth-$(CONFIG_BT_HS) += a2mp.o amp.o
+bluetooth-$(CONFIG_BT_LEDS) += leds.o
bluetooth-$(CONFIG_BT_DEBUGFS) += hci_debugfs.o
bluetooth-$(CONFIG_BT_SELFTEST) += selftest.o
diff --git a/net/bluetooth/af_bluetooth.c b/net/bluetooth/af_bluetooth.c
index a3bffd1ec2b4..955eda93e66f 100644
--- a/net/bluetooth/af_bluetooth.c
+++ b/net/bluetooth/af_bluetooth.c
@@ -33,8 +33,6 @@
#include "selftest.h"
-#define VERSION "2.21"
-
/* Bluetooth sockets */
#define BT_MAX_PROTO 8
static const struct net_proto_family *bt_proto[BT_MAX_PROTO];
@@ -176,20 +174,20 @@ EXPORT_SYMBOL(bt_accept_unlink);
struct sock *bt_accept_dequeue(struct sock *parent, struct socket *newsock)
{
- struct list_head *p, *n;
+ struct bt_sock *s, *n;
struct sock *sk;
BT_DBG("parent %p", parent);
- list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
- sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
+ list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *)s;
lock_sock(sk);
/* FIXME: Is this check still needed */
if (sk->sk_state == BT_CLOSED) {
- release_sock(sk);
bt_accept_unlink(sk);
+ release_sock(sk);
continue;
}
@@ -271,11 +269,11 @@ static long bt_sock_data_wait(struct sock *sk, long timeo)
if (signal_pending(current) || !timeo)
break;
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
release_sock(sk);
timeo = schedule_timeout(timeo);
lock_sock(sk);
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
}
__set_current_state(TASK_RUNNING);
@@ -390,11 +388,11 @@ EXPORT_SYMBOL(bt_sock_stream_recvmsg);
static inline unsigned int bt_accept_poll(struct sock *parent)
{
- struct list_head *p, *n;
+ struct bt_sock *s, *n;
struct sock *sk;
- list_for_each_safe(p, n, &bt_sk(parent)->accept_q) {
- sk = (struct sock *) list_entry(p, struct bt_sock, accept_q);
+ list_for_each_entry_safe(s, n, &bt_sk(parent)->accept_q, accept_q) {
+ sk = (struct sock *)s;
if (sk->sk_state == BT_CONNECTED ||
(test_bit(BT_SK_DEFER_SETUP, &bt_sk(parent)->flags) &&
sk->sk_state == BT_CONNECT2))
@@ -441,7 +439,7 @@ unsigned int bt_sock_poll(struct file *file, struct socket *sock,
if (!test_bit(BT_SK_SUSPEND, &bt_sk(sk)->flags) && sock_writeable(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
@@ -671,7 +669,7 @@ static const struct file_operations bt_fops = {
};
int bt_procfs_init(struct net *net, const char *name,
- struct bt_sock_list* sk_list,
+ struct bt_sock_list *sk_list,
int (* seq_show)(struct seq_file *, void *))
{
sk_list->custom_seq_show = seq_show;
@@ -687,7 +685,7 @@ void bt_procfs_cleanup(struct net *net, const char *name)
}
#else
int bt_procfs_init(struct net *net, const char *name,
- struct bt_sock_list* sk_list,
+ struct bt_sock_list *sk_list,
int (* seq_show)(struct seq_file *, void *))
{
return 0;
@@ -715,7 +713,7 @@ static int __init bt_init(void)
sock_skb_cb_check_size(sizeof(struct bt_skb_cb));
- BT_INFO("Core ver %s", VERSION);
+ BT_INFO("Core ver %s", BT_SUBSYS_VERSION);
err = bt_selftest();
if (err < 0)
@@ -789,7 +787,7 @@ subsys_initcall(bt_init);
module_exit(bt_exit);
MODULE_AUTHOR("Marcel Holtmann <marcel@holtmann.org>");
-MODULE_DESCRIPTION("Bluetooth Core ver " VERSION);
-MODULE_VERSION(VERSION);
+MODULE_DESCRIPTION("Bluetooth Core ver " BT_SUBSYS_VERSION);
+MODULE_VERSION(BT_SUBSYS_VERSION);
MODULE_LICENSE("GPL");
MODULE_ALIAS_NETPROTO(PF_BLUETOOTH);
diff --git a/net/bluetooth/bnep/core.c b/net/bluetooth/bnep/core.c
index 1641367e54ca..fbf251fef70f 100644
--- a/net/bluetooth/bnep/core.c
+++ b/net/bluetooth/bnep/core.c
@@ -608,8 +608,11 @@ int bnep_add_connection(struct bnep_connadd_req *req, struct socket *sock)
s->msg.msg_flags = MSG_NOSIGNAL;
#ifdef CONFIG_BT_BNEP_MC_FILTER
- /* Set default mc filter */
- set_bit(bnep_mc_hash(dev->broadcast), (ulong *) &s->mc_filter);
+ /* Set default mc filter to not filter out any mc addresses
+ * as defined in the BNEP specification (revision 0.95a)
+ * http://grouper.ieee.org/groups/802/15/Bluetooth/BNEP.pdf
+ */
+ s->mc_filter = ~0LL;
#endif
#ifdef CONFIG_BT_BNEP_PROTO_FILTER
diff --git a/net/bluetooth/cmtp/capi.c b/net/bluetooth/cmtp/capi.c
index 9a50338772f3..46ac686c8911 100644
--- a/net/bluetooth/cmtp/capi.c
+++ b/net/bluetooth/cmtp/capi.c
@@ -100,10 +100,8 @@ static void cmtp_application_del(struct cmtp_session *session, struct cmtp_appli
static struct cmtp_application *cmtp_application_get(struct cmtp_session *session, int pattern, __u16 value)
{
struct cmtp_application *app;
- struct list_head *p;
- list_for_each(p, &session->applications) {
- app = list_entry(p, struct cmtp_application, list);
+ list_for_each_entry(app, &session->applications, list) {
switch (pattern) {
case CMTP_MSGNUM:
if (app->msgnum == value)
@@ -511,14 +509,12 @@ static int cmtp_proc_show(struct seq_file *m, void *v)
struct capi_ctr *ctrl = m->private;
struct cmtp_session *session = ctrl->driverdata;
struct cmtp_application *app;
- struct list_head *p;
seq_printf(m, "%s\n\n", cmtp_procinfo(ctrl));
seq_printf(m, "addr %s\n", session->name);
seq_printf(m, "ctrl %d\n", session->num);
- list_for_each(p, &session->applications) {
- app = list_entry(p, struct cmtp_application, list);
+ list_for_each_entry(app, &session->applications, list) {
seq_printf(m, "appl %d -> %d\n", app->appl, app->mapping);
}
diff --git a/net/bluetooth/cmtp/core.c b/net/bluetooth/cmtp/core.c
index 298ed37010e6..9e59b6654126 100644
--- a/net/bluetooth/cmtp/core.c
+++ b/net/bluetooth/cmtp/core.c
@@ -178,8 +178,7 @@ static inline int cmtp_recv_frame(struct cmtp_session *session, struct sk_buff *
cmtp_add_msgpart(session, id, skb->data + hdrlen, len);
break;
default:
- if (session->reassembly[id] != NULL)
- kfree_skb(session->reassembly[id]);
+ kfree_skb(session->reassembly[id]);
session->reassembly[id] = NULL;
break;
}
diff --git a/net/bluetooth/hci_conn.c b/net/bluetooth/hci_conn.c
index 85b82f7adbd2..bf9f8a801a2e 100644
--- a/net/bluetooth/hci_conn.c
+++ b/net/bluetooth/hci_conn.c
@@ -178,6 +178,10 @@ static void hci_connect_le_scan_remove(struct hci_conn *conn)
hci_dev_hold(conn->hdev);
hci_conn_get(conn);
+ /* Even though we hold a reference to the hdev, many other
+ * things might get cleaned up meanwhile, including the hdev's
+ * own workqueue, so we can't use that for scheduling.
+ */
schedule_work(&conn->le_scan_cleanup);
}
@@ -664,8 +668,16 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status)
conn->state = BT_CLOSED;
- mgmt_connect_failed(hdev, &conn->dst, conn->type, conn->dst_type,
- status);
+ /* If the status indicates successful cancellation of
+ * the attempt (i.e. Unkown Connection Id) there's no point of
+ * notifying failure since we'll go back to keep trying to
+ * connect. The only exception is explicit connect requests
+ * where a timeout + cancel does indicate an actual failure.
+ */
+ if (status != HCI_ERROR_UNKNOWN_CONN_ID ||
+ (params && params->explicit_connect))
+ mgmt_connect_failed(hdev, &conn->dst, conn->type,
+ conn->dst_type, status);
hci_connect_cfm(conn, status);
@@ -679,7 +691,7 @@ void hci_le_conn_failed(struct hci_conn *conn, u8 status)
/* Re-enable advertising in case this was a failed connection
* attempt as a peripheral.
*/
- mgmt_reenable_advertising(hdev);
+ hci_req_reenable_advertising(hdev);
}
static void create_le_conn_complete(struct hci_dev *hdev, u8 status, u16 opcode)
@@ -707,6 +719,13 @@ done:
hci_dev_unlock(hdev);
}
+static bool conn_use_rpa(struct hci_conn *conn)
+{
+ struct hci_dev *hdev = conn->hdev;
+
+ return hci_dev_test_flag(hdev, HCI_PRIVACY);
+}
+
static void hci_req_add_le_create_conn(struct hci_request *req,
struct hci_conn *conn)
{
@@ -714,16 +733,21 @@ static void hci_req_add_le_create_conn(struct hci_request *req,
struct hci_dev *hdev = conn->hdev;
u8 own_addr_type;
- memset(&cp, 0, sizeof(cp));
-
/* Update random address, but set require_privacy to false so
* that we never connect with an non-resolvable address.
*/
- if (hci_update_random_address(req, false, &own_addr_type))
+ if (hci_update_random_address(req, false, conn_use_rpa(conn),
+ &own_addr_type))
return;
+ memset(&cp, 0, sizeof(cp));
+
+ /* Set window to be the same value as the interval to enable
+ * continuous scanning.
+ */
cp.scan_interval = cpu_to_le16(hdev->le_scan_interval);
- cp.scan_window = cpu_to_le16(hdev->le_scan_window);
+ cp.scan_window = cp.scan_interval;
+
bacpy(&cp.peer_addr, &conn->dst);
cp.peer_addr_type = conn->dst_type;
cp.own_address_type = own_addr_type;
@@ -758,7 +782,8 @@ static void hci_req_directed_advertising(struct hci_request *req,
/* Set require_privacy to false so that the remote device has a
* chance of identifying us.
*/
- if (hci_update_random_address(req, false, &own_addr_type) < 0)
+ if (hci_update_random_address(req, false, conn_use_rpa(conn),
+ &own_addr_type) < 0)
return;
memset(&cp, 0, sizeof(cp));
@@ -781,7 +806,7 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
u8 role)
{
struct hci_conn_params *params;
- struct hci_conn *conn, *conn_unfinished;
+ struct hci_conn *conn;
struct smp_irk *irk;
struct hci_request req;
int err;
@@ -794,35 +819,22 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
return ERR_PTR(-EOPNOTSUPP);
}
- /* Some devices send ATT messages as soon as the physical link is
- * established. To be able to handle these ATT messages, the user-
- * space first establishes the connection and then starts the pairing
- * process.
- *
- * So if a hci_conn object already exists for the following connection
- * attempt, we simply update pending_sec_level and auth_type fields
- * and return the object found.
- */
- conn = hci_conn_hash_lookup_le(hdev, dst, dst_type);
- conn_unfinished = NULL;
- if (conn) {
- if (conn->state == BT_CONNECT &&
- test_bit(HCI_CONN_SCANNING, &conn->flags)) {
- BT_DBG("will continue unfinished conn %pMR", dst);
- conn_unfinished = conn;
- } else {
- if (conn->pending_sec_level < sec_level)
- conn->pending_sec_level = sec_level;
- goto done;
- }
- }
-
/* Since the controller supports only one LE connection attempt at a
* time, we return -EBUSY if there is any connection attempt running.
*/
if (hci_lookup_le_connect(hdev))
return ERR_PTR(-EBUSY);
+ /* If there's already a connection object but it's not in
+ * scanning state it means it must already be established, in
+ * which case we can't do anything else except report a failure
+ * to connect.
+ */
+ conn = hci_conn_hash_lookup_le(hdev, dst, dst_type);
+ if (conn && !test_bit(HCI_CONN_SCANNING, &conn->flags)) {
+ return ERR_PTR(-EBUSY);
+ }
+
/* When given an identity address with existing identity
* resolving key, the connection needs to be established
* to a resolvable random address.
@@ -838,23 +850,20 @@ struct hci_conn *hci_connect_le(struct hci_dev *hdev, bdaddr_t *dst,
dst_type = ADDR_LE_DEV_RANDOM;
}
- if (conn_unfinished) {
- conn = conn_unfinished;
+ if (conn) {
bacpy(&conn->dst, dst);
} else {
conn = hci_conn_add(hdev, LE_LINK, dst, role);
+ if (!conn)
+ return ERR_PTR(-ENOMEM);
+ hci_conn_hold(conn);
+ conn->pending_sec_level = sec_level;
}
- if (!conn)
- return ERR_PTR(-ENOMEM);
-
conn->dst_type = dst_type;
conn->sec_level = BT_SECURITY_LOW;
conn->conn_timeout = conn_timeout;
- if (!conn_unfinished)
- conn->pending_sec_level = sec_level;
-
hci_req_init(&req, hdev);
/* Disable advertising if we're active. For master role
@@ -918,37 +927,9 @@ create_conn:
return ERR_PTR(err);
}
-done:
- /* If this is continuation of connect started by hci_connect_le_scan,
- * it already called hci_conn_hold and calling it again would mess the
- * counter.
- */
- if (!conn_unfinished)
- hci_conn_hold(conn);
-
return conn;
}
-static void hci_connect_le_scan_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
-{
- struct hci_conn *conn;
-
- if (!status)
- return;
-
- BT_ERR("Failed to add device to auto conn whitelist: status 0x%2.2x",
- status);
-
- hci_dev_lock(hdev);
-
- conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
- if (conn)
- hci_le_conn_failed(conn, status);
-
- hci_dev_unlock(hdev);
-}
-
static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
{
struct hci_conn *conn;
@@ -964,10 +945,9 @@ static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
}
/* This function requires the caller holds hdev->lock */
-static int hci_explicit_conn_params_set(struct hci_request *req,
+static int hci_explicit_conn_params_set(struct hci_dev *hdev,
bdaddr_t *addr, u8 addr_type)
{
- struct hci_dev *hdev = req->hdev;
struct hci_conn_params *params;
if (is_connected(hdev, addr, addr_type))
@@ -995,7 +975,6 @@ static int hci_explicit_conn_params_set(struct hci_request *req,
}
params->explicit_connect = true;
- __hci_update_background_scan(req);
BT_DBG("addr %pMR (type %u) auto_connect %u", addr, addr_type,
params->auto_connect);
@@ -1006,11 +985,9 @@ static int hci_explicit_conn_params_set(struct hci_request *req,
/* This function requires the caller holds hdev->lock */
struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
u8 dst_type, u8 sec_level,
- u16 conn_timeout, u8 role)
+ u16 conn_timeout)
{
struct hci_conn *conn;
- struct hci_request req;
- int err;
/* Let's make sure that le is enabled.*/
if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
@@ -1038,29 +1015,22 @@ struct hci_conn *hci_connect_le_scan(struct hci_dev *hdev, bdaddr_t *dst,
BT_DBG("requesting refresh of dst_addr");
- conn = hci_conn_add(hdev, LE_LINK, dst, role);
+ conn = hci_conn_add(hdev, LE_LINK, dst, HCI_ROLE_MASTER);
if (!conn)
return ERR_PTR(-ENOMEM);
- hci_req_init(&req, hdev);
-
- if (hci_explicit_conn_params_set(&req, dst, dst_type) < 0)
+ if (hci_explicit_conn_params_set(hdev, dst, dst_type) < 0)
return ERR_PTR(-EBUSY);
conn->state = BT_CONNECT;
set_bit(HCI_CONN_SCANNING, &conn->flags);
-
- err = hci_req_run(&req, hci_connect_le_scan_complete);
- if (err && err != -ENODATA) {
- hci_conn_del(conn);
- return ERR_PTR(err);
- }
-
conn->dst_type = dst_type;
conn->sec_level = BT_SECURITY_LOW;
conn->pending_sec_level = sec_level;
conn->conn_timeout = conn_timeout;
+ hci_update_background_scan(hdev);
+
done:
hci_conn_hold(conn);
return conn;
diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
index 83a6aacfab31..2713fc86e85a 100644
--- a/net/bluetooth/hci_core.c
+++ b/net/bluetooth/hci_core.c
@@ -40,6 +40,7 @@
#include "hci_request.h"
#include "hci_debugfs.h"
#include "smp.h"
+#include "leds.h"
static void hci_rx_work(struct work_struct *work);
static void hci_cmd_work(struct work_struct *work);
@@ -56,15 +57,6 @@ DEFINE_MUTEX(hci_cb_list_lock);
/* HCI ID Numbering */
static DEFINE_IDA(hci_index_ida);
-/* ----- HCI requests ----- */
-
-#define HCI_REQ_DONE 0
-#define HCI_REQ_PEND 1
-#define HCI_REQ_CANCELED 2
-
-#define hci_req_lock(d) mutex_lock(&d->req_lock)
-#define hci_req_unlock(d) mutex_unlock(&d->req_lock)
-
/* ---- HCI debugfs entries ---- */
static ssize_t dut_mode_read(struct file *file, char __user *user_buf,
@@ -73,7 +65,7 @@ static ssize_t dut_mode_read(struct file *file, char __user *user_buf,
struct hci_dev *hdev = file->private_data;
char buf[3];
- buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y': 'N';
+ buf[0] = hci_dev_test_flag(hdev, HCI_DUT_MODE) ? 'Y' : 'N';
buf[1] = '\n';
buf[2] = '\0';
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
@@ -101,14 +93,14 @@ static ssize_t dut_mode_write(struct file *file, const char __user *user_buf,
if (enable == hci_dev_test_flag(hdev, HCI_DUT_MODE))
return -EALREADY;
- hci_req_lock(hdev);
+ hci_req_sync_lock(hdev);
if (enable)
skb = __hci_cmd_sync(hdev, HCI_OP_ENABLE_DUT_MODE, 0, NULL,
HCI_CMD_TIMEOUT);
else
skb = __hci_cmd_sync(hdev, HCI_OP_RESET, 0, NULL,
HCI_CMD_TIMEOUT);
- hci_req_unlock(hdev);
+ hci_req_sync_unlock(hdev);
if (IS_ERR(skb))
return PTR_ERR(skb);
@@ -133,7 +125,7 @@ static ssize_t vendor_diag_read(struct file *file, char __user *user_buf,
struct hci_dev *hdev = file->private_data;
char buf[3];
- buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y': 'N';
+ buf[0] = hci_dev_test_flag(hdev, HCI_VENDOR_DIAG) ? 'Y' : 'N';
buf[1] = '\n';
buf[2] = '\0';
return simple_read_from_buffer(user_buf, count, ppos, buf, 2);
@@ -165,9 +157,9 @@ static ssize_t vendor_diag_write(struct file *file, const char __user *user_buf,
!test_bit(HCI_RUNNING, &hdev->flags))
goto done;
- hci_req_lock(hdev);
+ hci_req_sync_lock(hdev);
err = hdev->set_diag(hdev, enable);
- hci_req_unlock(hdev);
+ hci_req_sync_unlock(hdev);
if (err < 0)
return err;
@@ -198,197 +190,14 @@ static void hci_debugfs_create_basic(struct hci_dev *hdev)
&vendor_diag_fops);
}
-/* ---- HCI requests ---- */
-
-static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
- struct sk_buff *skb)
-{
- BT_DBG("%s result 0x%2.2x", hdev->name, result);
-
- if (hdev->req_status == HCI_REQ_PEND) {
- hdev->req_result = result;
- hdev->req_status = HCI_REQ_DONE;
- if (skb)
- hdev->req_skb = skb_get(skb);
- wake_up_interruptible(&hdev->req_wait_q);
- }
-}
-
-static void hci_req_cancel(struct hci_dev *hdev, int err)
-{
- BT_DBG("%s err 0x%2.2x", hdev->name, err);
-
- if (hdev->req_status == HCI_REQ_PEND) {
- hdev->req_result = err;
- hdev->req_status = HCI_REQ_CANCELED;
- wake_up_interruptible(&hdev->req_wait_q);
- }
-}
-
-struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
- const void *param, u8 event, u32 timeout)
-{
- DECLARE_WAITQUEUE(wait, current);
- struct hci_request req;
- struct sk_buff *skb;
- int err = 0;
-
- BT_DBG("%s", hdev->name);
-
- hci_req_init(&req, hdev);
-
- hci_req_add_ev(&req, opcode, plen, param, event);
-
- hdev->req_status = HCI_REQ_PEND;
-
- add_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
-
- err = hci_req_run_skb(&req, hci_req_sync_complete);
- if (err < 0) {
- remove_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_RUNNING);
- return ERR_PTR(err);
- }
-
- schedule_timeout(timeout);
-
- remove_wait_queue(&hdev->req_wait_q, &wait);
-
- if (signal_pending(current))
- return ERR_PTR(-EINTR);
-
- switch (hdev->req_status) {
- case HCI_REQ_DONE:
- err = -bt_to_errno(hdev->req_result);
- break;
-
- case HCI_REQ_CANCELED:
- err = -hdev->req_result;
- break;
-
- default:
- err = -ETIMEDOUT;
- break;
- }
-
- hdev->req_status = hdev->req_result = 0;
- skb = hdev->req_skb;
- hdev->req_skb = NULL;
-
- BT_DBG("%s end: err %d", hdev->name, err);
-
- if (err < 0) {
- kfree_skb(skb);
- return ERR_PTR(err);
- }
-
- if (!skb)
- return ERR_PTR(-ENODATA);
-
- return skb;
-}
-EXPORT_SYMBOL(__hci_cmd_sync_ev);
-
-struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
- const void *param, u32 timeout)
-{
- return __hci_cmd_sync_ev(hdev, opcode, plen, param, 0, timeout);
-}
-EXPORT_SYMBOL(__hci_cmd_sync);
-
-/* Execute request and wait for completion. */
-static int __hci_req_sync(struct hci_dev *hdev,
- void (*func)(struct hci_request *req,
- unsigned long opt),
- unsigned long opt, __u32 timeout)
-{
- struct hci_request req;
- DECLARE_WAITQUEUE(wait, current);
- int err = 0;
-
- BT_DBG("%s start", hdev->name);
-
- hci_req_init(&req, hdev);
-
- hdev->req_status = HCI_REQ_PEND;
-
- func(&req, opt);
-
- add_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_INTERRUPTIBLE);
-
- err = hci_req_run_skb(&req, hci_req_sync_complete);
- if (err < 0) {
- hdev->req_status = 0;
-
- remove_wait_queue(&hdev->req_wait_q, &wait);
- set_current_state(TASK_RUNNING);
-
- /* ENODATA means the HCI request command queue is empty.
- * This can happen when a request with conditionals doesn't
- * trigger any commands to be sent. This is normal behavior
- * and should not trigger an error return.
- */
- if (err == -ENODATA)
- return 0;
-
- return err;
- }
-
- schedule_timeout(timeout);
-
- remove_wait_queue(&hdev->req_wait_q, &wait);
-
- if (signal_pending(current))
- return -EINTR;
-
- switch (hdev->req_status) {
- case HCI_REQ_DONE:
- err = -bt_to_errno(hdev->req_result);
- break;
-
- case HCI_REQ_CANCELED:
- err = -hdev->req_result;
- break;
-
- default:
- err = -ETIMEDOUT;
- break;
- }
-
- hdev->req_status = hdev->req_result = 0;
-
- BT_DBG("%s end: err %d", hdev->name, err);
-
- return err;
-}
-
-static int hci_req_sync(struct hci_dev *hdev,
- void (*req)(struct hci_request *req,
- unsigned long opt),
- unsigned long opt, __u32 timeout)
-{
- int ret;
-
- if (!test_bit(HCI_UP, &hdev->flags))
- return -ENETDOWN;
-
- /* Serialize all requests */
- hci_req_lock(hdev);
- ret = __hci_req_sync(hdev, req, opt, timeout);
- hci_req_unlock(hdev);
-
- return ret;
-}
-
-static void hci_reset_req(struct hci_request *req, unsigned long opt)
+static int hci_reset_req(struct hci_request *req, unsigned long opt)
{
BT_DBG("%s %ld", req->hdev->name, opt);
/* Reset device */
set_bit(HCI_RESET, &req->hdev->flags);
hci_req_add(req, HCI_OP_RESET, 0, NULL);
+ return 0;
}
static void bredr_init(struct hci_request *req)
@@ -428,7 +237,7 @@ static void amp_init1(struct hci_request *req)
hci_req_add(req, HCI_OP_READ_LOCATION_DATA, 0, NULL);
}
-static void amp_init2(struct hci_request *req)
+static int amp_init2(struct hci_request *req)
{
/* Read Local Supported Features. Not all AMP controllers
* support this so it's placed conditionally in the second
@@ -436,9 +245,11 @@ static void amp_init2(struct hci_request *req)
*/
if (req->hdev->commands[14] & 0x20)
hci_req_add(req, HCI_OP_READ_LOCAL_FEATURES, 0, NULL);
+
+ return 0;
}
-static void hci_init1_req(struct hci_request *req, unsigned long opt)
+static int hci_init1_req(struct hci_request *req, unsigned long opt)
{
struct hci_dev *hdev = req->hdev;
@@ -461,6 +272,8 @@ static void hci_init1_req(struct hci_request *req, unsigned long opt)
BT_ERR("Unknown device type %d", hdev->dev_type);
break;
}
+
+ return 0;
}
static void bredr_setup(struct hci_request *req)
@@ -508,12 +321,6 @@ static void le_setup(struct hci_request *req)
/* Read LE Supported States */
hci_req_add(req, HCI_OP_LE_READ_SUPPORTED_STATES, 0, NULL);
- /* Read LE White List Size */
- hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE, 0, NULL);
-
- /* Clear LE White List */
- hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
-
/* LE-only controllers have LE implicitly enabled */
if (!lmp_bredr_capable(hdev))
hci_dev_set_flag(hdev, HCI_LE_ENABLED);
@@ -537,20 +344,30 @@ static void hci_setup_event_mask(struct hci_request *req)
if (lmp_bredr_capable(hdev)) {
events[4] |= 0x01; /* Flow Specification Complete */
- events[4] |= 0x02; /* Inquiry Result with RSSI */
- events[4] |= 0x04; /* Read Remote Extended Features Complete */
- events[5] |= 0x08; /* Synchronous Connection Complete */
- events[5] |= 0x10; /* Synchronous Connection Changed */
} else {
/* Use a different default for LE-only devices */
memset(events, 0, sizeof(events));
- events[0] |= 0x10; /* Disconnection Complete */
- events[1] |= 0x08; /* Read Remote Version Information Complete */
events[1] |= 0x20; /* Command Complete */
events[1] |= 0x40; /* Command Status */
events[1] |= 0x80; /* Hardware Error */
- events[2] |= 0x04; /* Number of Completed Packets */
- events[3] |= 0x02; /* Data Buffer Overflow */
+
+ /* If the controller supports the Disconnect command, enable
+ * the corresponding event. In addition enable packet flow
+ * control related events.
+ */
+ if (hdev->commands[0] & 0x20) {
+ events[0] |= 0x10; /* Disconnection Complete */
+ events[2] |= 0x04; /* Number of Completed Packets */
+ events[3] |= 0x02; /* Data Buffer Overflow */
+ }
+
+ /* If the controller supports the Read Remote Version
+ * Information command, enable the corresponding event.
+ */
+ if (hdev->commands[2] & 0x80)
+ events[1] |= 0x08; /* Read Remote Version Information
+ * Complete
+ */
if (hdev->le_features[0] & HCI_LE_ENCRYPTION) {
events[0] |= 0x80; /* Encryption Change */
@@ -558,9 +375,18 @@ static void hci_setup_event_mask(struct hci_request *req)
}
}
- if (lmp_inq_rssi_capable(hdev))
+ if (lmp_inq_rssi_capable(hdev) ||
+ test_bit(HCI_QUIRK_FIXUP_INQUIRY_MODE, &hdev->quirks))
events[4] |= 0x02; /* Inquiry Result with RSSI */
+ if (lmp_ext_feat_capable(hdev))
+ events[4] |= 0x04; /* Read Remote Extended Features Complete */
+
+ if (lmp_esco_capable(hdev)) {
+ events[5] |= 0x08; /* Synchronous Connection Complete */
+ events[5] |= 0x10; /* Synchronous Connection Changed */
+ }
+
if (lmp_sniffsubr_capable(hdev))
events[5] |= 0x20; /* Sniff Subrating */
@@ -596,7 +422,7 @@ static void hci_setup_event_mask(struct hci_request *req)
hci_req_add(req, HCI_OP_SET_EVENT_MASK, sizeof(events), events);
}
-static void hci_init2_req(struct hci_request *req, unsigned long opt)
+static int hci_init2_req(struct hci_request *req, unsigned long opt)
{
struct hci_dev *hdev = req->hdev;
@@ -676,6 +502,8 @@ static void hci_init2_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, sizeof(enable),
&enable);
}
+
+ return 0;
}
static void hci_setup_link_policy(struct hci_request *req)
@@ -750,7 +578,7 @@ static void hci_set_event_mask_page_2(struct hci_request *req)
hci_req_add(req, HCI_OP_SET_EVENT_MASK_PAGE_2, sizeof(events), events);
}
-static void hci_init3_req(struct hci_request *req, unsigned long opt)
+static int hci_init3_req(struct hci_request *req, unsigned long opt)
{
struct hci_dev *hdev = req->hdev;
u8 p;
@@ -783,7 +611,6 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt)
u8 events[8];
memset(events, 0, sizeof(events));
- events[0] = 0x0f;
if (hdev->le_features[0] & HCI_LE_ENCRYPTION)
events[0] |= 0x10; /* LE Long Term Key Request */
@@ -810,6 +637,34 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt)
* Report
*/
+ /* If the controller supports the LE Set Scan Enable command,
+ * enable the corresponding advertising report event.
+ */
+ if (hdev->commands[26] & 0x08)
+ events[0] |= 0x02; /* LE Advertising Report */
+
+ /* If the controller supports the LE Create Connection
+ * command, enable the corresponding event.
+ */
+ if (hdev->commands[26] & 0x10)
+ events[0] |= 0x01; /* LE Connection Complete */
+
+ /* If the controller supports the LE Connection Update
+ * command, enable the corresponding event.
+ */
+ if (hdev->commands[27] & 0x04)
+ events[0] |= 0x04; /* LE Connection Update
+ * Complete
+ */
+
+ /* If the controller supports the LE Read Remote Used Features
+ * command, enable the corresponding event.
+ */
+ if (hdev->commands[27] & 0x20)
+ events[0] |= 0x08; /* LE Read Remote Used
+ * Features Complete
+ */
+
/* If the controller supports the LE Read Local P-256
* Public Key command, enable the corresponding event.
*/
@@ -832,6 +687,17 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_LE_READ_ADV_TX_POWER, 0, NULL);
}
+ if (hdev->commands[26] & 0x40) {
+ /* Read LE White List Size */
+ hci_req_add(req, HCI_OP_LE_READ_WHITE_LIST_SIZE,
+ 0, NULL);
+ }
+
+ if (hdev->commands[26] & 0x80) {
+ /* Clear LE White List */
+ hci_req_add(req, HCI_OP_LE_CLEAR_WHITE_LIST, 0, NULL);
+ }
+
if (hdev->le_features[0] & HCI_LE_DATA_LEN_EXT) {
/* Read LE Maximum Data Length */
hci_req_add(req, HCI_OP_LE_READ_MAX_DATA_LEN, 0, NULL);
@@ -851,9 +717,11 @@ static void hci_init3_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_READ_LOCAL_EXT_FEATURES,
sizeof(cp), &cp);
}
+
+ return 0;
}
-static void hci_init4_req(struct hci_request *req, unsigned long opt)
+static int hci_init4_req(struct hci_request *req, unsigned long opt)
{
struct hci_dev *hdev = req->hdev;
@@ -904,20 +772,22 @@ static void hci_init4_req(struct hci_request *req, unsigned long opt)
hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT,
sizeof(support), &support);
}
+
+ return 0;
}
static int __hci_init(struct hci_dev *hdev)
{
int err;
- err = __hci_req_sync(hdev, hci_init1_req, 0, HCI_INIT_TIMEOUT);
+ err = __hci_req_sync(hdev, hci_init1_req, 0, HCI_INIT_TIMEOUT, NULL);
if (err < 0)
return err;
if (hci_dev_test_flag(hdev, HCI_SETUP))
hci_debugfs_create_basic(hdev);
- err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT);
+ err = __hci_req_sync(hdev, hci_init2_req, 0, HCI_INIT_TIMEOUT, NULL);
if (err < 0)
return err;
@@ -928,11 +798,11 @@ static int __hci_init(struct hci_dev *hdev)
if (hdev->dev_type != HCI_BREDR)
return 0;
- err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT);
+ err = __hci_req_sync(hdev, hci_init3_req, 0, HCI_INIT_TIMEOUT, NULL);
if (err < 0)
return err;
- err = __hci_req_sync(hdev, hci_init4_req, 0, HCI_INIT_TIMEOUT);
+ err = __hci_req_sync(hdev, hci_init4_req, 0, HCI_INIT_TIMEOUT, NULL);
if (err < 0)
return err;
@@ -963,7 +833,7 @@ static int __hci_init(struct hci_dev *hdev)
return 0;
}
-static void hci_init0_req(struct hci_request *req, unsigned long opt)
+static int hci_init0_req(struct hci_request *req, unsigned long opt)
{
struct hci_dev *hdev = req->hdev;
@@ -979,6 +849,8 @@ static void hci_init0_req(struct hci_request *req, unsigned long opt)
/* Read BD Address */
if (hdev->set_bdaddr)
hci_req_add(req, HCI_OP_READ_BD_ADDR, 0, NULL);
+
+ return 0;
}
static int __hci_unconf_init(struct hci_dev *hdev)
@@ -988,7 +860,7 @@ static int __hci_unconf_init(struct hci_dev *hdev)
if (test_bit(HCI_QUIRK_RAW_DEVICE, &hdev->quirks))
return 0;
- err = __hci_req_sync(hdev, hci_init0_req, 0, HCI_INIT_TIMEOUT);
+ err = __hci_req_sync(hdev, hci_init0_req, 0, HCI_INIT_TIMEOUT, NULL);
if (err < 0)
return err;
@@ -998,7 +870,7 @@ static int __hci_unconf_init(struct hci_dev *hdev)
return 0;
}
-static void hci_scan_req(struct hci_request *req, unsigned long opt)
+static int hci_scan_req(struct hci_request *req, unsigned long opt)
{
__u8 scan = opt;
@@ -1006,9 +878,10 @@ static void hci_scan_req(struct hci_request *req, unsigned long opt)
/* Inquiry and Page scans */
hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+ return 0;
}
-static void hci_auth_req(struct hci_request *req, unsigned long opt)
+static int hci_auth_req(struct hci_request *req, unsigned long opt)
{
__u8 auth = opt;
@@ -1016,9 +889,10 @@ static void hci_auth_req(struct hci_request *req, unsigned long opt)
/* Authentication */
hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE, 1, &auth);
+ return 0;
}
-static void hci_encrypt_req(struct hci_request *req, unsigned long opt)
+static int hci_encrypt_req(struct hci_request *req, unsigned long opt)
{
__u8 encrypt = opt;
@@ -1026,9 +900,10 @@ static void hci_encrypt_req(struct hci_request *req, unsigned long opt)
/* Encryption */
hci_req_add(req, HCI_OP_WRITE_ENCRYPT_MODE, 1, &encrypt);
+ return 0;
}
-static void hci_linkpol_req(struct hci_request *req, unsigned long opt)
+static int hci_linkpol_req(struct hci_request *req, unsigned long opt)
{
__le16 policy = cpu_to_le16(opt);
@@ -1036,6 +911,7 @@ static void hci_linkpol_req(struct hci_request *req, unsigned long opt)
/* Default link policy */
hci_req_add(req, HCI_OP_WRITE_DEF_LINK_POLICY, 2, &policy);
+ return 0;
}
/* Get HCI device by index.
@@ -1280,7 +1156,7 @@ static int inquiry_cache_dump(struct hci_dev *hdev, int num, __u8 *buf)
return copied;
}
-static void hci_inq_req(struct hci_request *req, unsigned long opt)
+static int hci_inq_req(struct hci_request *req, unsigned long opt)
{
struct hci_inquiry_req *ir = (struct hci_inquiry_req *) opt;
struct hci_dev *hdev = req->hdev;
@@ -1289,13 +1165,15 @@ static void hci_inq_req(struct hci_request *req, unsigned long opt)
BT_DBG("%s", hdev->name);
if (test_bit(HCI_INQUIRY, &hdev->flags))
- return;
+ return 0;
/* Start Inquiry */
memcpy(&cp.lap, &ir->lap, 3);
cp.length = ir->length;
cp.num_rsp = ir->num_rsp;
hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
+
+ return 0;
}
int hci_inquiry(void __user *arg)
@@ -1346,7 +1224,7 @@ int hci_inquiry(void __user *arg)
if (do_inquiry) {
err = hci_req_sync(hdev, hci_inq_req, (unsigned long) &ir,
- timeo);
+ timeo, NULL);
if (err < 0)
goto done;
@@ -1399,7 +1277,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
BT_DBG("%s %p", hdev->name, hdev);
- hci_req_lock(hdev);
+ hci_req_sync_lock(hdev);
if (hci_dev_test_flag(hdev, HCI_UNREGISTER)) {
ret = -ENODEV;
@@ -1518,14 +1396,15 @@ static int hci_dev_do_open(struct hci_dev *hdev)
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
set_bit(HCI_UP, &hdev->flags);
hci_sock_dev_event(hdev, HCI_DEV_UP);
+ hci_leds_update_powered(hdev, true);
if (!hci_dev_test_flag(hdev, HCI_SETUP) &&
!hci_dev_test_flag(hdev, HCI_CONFIG) &&
!hci_dev_test_flag(hdev, HCI_UNCONFIGURED) &&
!hci_dev_test_flag(hdev, HCI_USER_CHANNEL) &&
+ hci_dev_test_flag(hdev, HCI_MGMT) &&
hdev->dev_type == HCI_BREDR) {
- hci_dev_lock(hdev);
- mgmt_powered(hdev, 1);
- hci_dev_unlock(hdev);
+ ret = __hci_req_hci_power_on(hdev);
+ mgmt_power_on(hdev, ret);
}
} else {
/* Init failed, cleanup */
@@ -1552,7 +1431,7 @@ static int hci_dev_do_open(struct hci_dev *hdev)
}
done:
- hci_req_unlock(hdev);
+ hci_req_sync_unlock(hdev);
return ret;
}
@@ -1646,21 +1525,22 @@ int hci_dev_do_close(struct hci_dev *hdev)
cancel_delayed_work(&hdev->power_off);
- hci_req_cancel(hdev, ENODEV);
- hci_req_lock(hdev);
+ hci_request_cancel_all(hdev);
+ hci_req_sync_lock(hdev);
if (!test_and_clear_bit(HCI_UP, &hdev->flags)) {
cancel_delayed_work_sync(&hdev->cmd_timer);
- hci_req_unlock(hdev);
+ hci_req_sync_unlock(hdev);
return 0;
}
+ hci_leds_update_powered(hdev, false);
+
/* Flush RX and TX works */
flush_work(&hdev->tx_work);
flush_work(&hdev->rx_work);
if (hdev->discov_timeout > 0) {
- cancel_delayed_work(&hdev->discov_off);
hdev->discov_timeout = 0;
hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
@@ -1669,17 +1549,9 @@ int hci_dev_do_close(struct hci_dev *hdev)
if (hci_dev_test_and_clear_flag(hdev, HCI_SERVICE_CACHE))
cancel_delayed_work(&hdev->service_cache);
- cancel_delayed_work_sync(&hdev->le_scan_disable);
- cancel_delayed_work_sync(&hdev->le_scan_restart);
-
if (hci_dev_test_flag(hdev, HCI_MGMT))
cancel_delayed_work_sync(&hdev->rpa_expired);
- if (hdev->adv_instance_timeout) {
- cancel_delayed_work_sync(&hdev->adv_instance_expire);
- hdev->adv_instance_timeout = 0;
- }
-
/* Avoid potential lockdep warnings from the *_flush() calls by
* ensuring the workqueue is empty up front.
*/
@@ -1691,8 +1563,9 @@ int hci_dev_do_close(struct hci_dev *hdev)
auto_off = hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF);
- if (!auto_off && hdev->dev_type == HCI_BREDR)
- mgmt_powered(hdev, 0);
+ if (!auto_off && hdev->dev_type == HCI_BREDR &&
+ hci_dev_test_flag(hdev, HCI_MGMT))
+ __mgmt_power_off(hdev);
hci_inquiry_cache_flush(hdev);
hci_pend_le_actions_clear(hdev);
@@ -1712,7 +1585,7 @@ int hci_dev_do_close(struct hci_dev *hdev)
if (test_bit(HCI_QUIRK_RESET_ON_CLOSE, &hdev->quirks) &&
!auto_off && !hci_dev_test_flag(hdev, HCI_UNCONFIGURED)) {
set_bit(HCI_INIT, &hdev->flags);
- __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT);
+ __hci_req_sync(hdev, hci_reset_req, 0, HCI_CMD_TIMEOUT, NULL);
clear_bit(HCI_INIT, &hdev->flags);
}
@@ -1749,7 +1622,7 @@ int hci_dev_do_close(struct hci_dev *hdev)
memset(hdev->dev_class, 0, sizeof(hdev->dev_class));
bacpy(&hdev->random_addr, BDADDR_ANY);
- hci_req_unlock(hdev);
+ hci_req_sync_unlock(hdev);
hci_dev_put(hdev);
return 0;
@@ -1785,7 +1658,7 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
BT_DBG("%s %p", hdev->name, hdev);
- hci_req_lock(hdev);
+ hci_req_sync_lock(hdev);
/* Drop queues */
skb_queue_purge(&hdev->rx_q);
@@ -1807,9 +1680,9 @@ static int hci_dev_do_reset(struct hci_dev *hdev)
atomic_set(&hdev->cmd_cnt, 1);
hdev->acl_cnt = 0; hdev->sco_cnt = 0; hdev->le_cnt = 0;
- ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT);
+ ret = __hci_req_sync(hdev, hci_reset_req, 0, HCI_INIT_TIMEOUT, NULL);
- hci_req_unlock(hdev);
+ hci_req_sync_unlock(hdev);
return ret;
}
@@ -1900,7 +1773,7 @@ static void hci_update_scan_state(struct hci_dev *hdev, u8 scan)
hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
if (hci_dev_test_flag(hdev, HCI_LE_ENABLED))
- mgmt_update_adv_data(hdev);
+ hci_req_update_adv_data(hdev, hdev->cur_adv_instance);
mgmt_new_settings(hdev);
}
@@ -1942,7 +1815,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
switch (cmd) {
case HCISETAUTH:
err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt,
- HCI_INIT_TIMEOUT);
+ HCI_INIT_TIMEOUT, NULL);
break;
case HCISETENCRYPT:
@@ -1954,18 +1827,18 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
if (!test_bit(HCI_AUTH, &hdev->flags)) {
/* Auth must be enabled first */
err = hci_req_sync(hdev, hci_auth_req, dr.dev_opt,
- HCI_INIT_TIMEOUT);
+ HCI_INIT_TIMEOUT, NULL);
if (err)
break;
}
err = hci_req_sync(hdev, hci_encrypt_req, dr.dev_opt,
- HCI_INIT_TIMEOUT);
+ HCI_INIT_TIMEOUT, NULL);
break;
case HCISETSCAN:
err = hci_req_sync(hdev, hci_scan_req, dr.dev_opt,
- HCI_INIT_TIMEOUT);
+ HCI_INIT_TIMEOUT, NULL);
/* Ensure that the connectable and discoverable states
* get correctly modified as this was a non-mgmt change.
@@ -1976,7 +1849,7 @@ int hci_dev_cmd(unsigned int cmd, void __user *arg)
case HCISETLINKPOL:
err = hci_req_sync(hdev, hci_linkpol_req, dr.dev_opt,
- HCI_INIT_TIMEOUT);
+ HCI_INIT_TIMEOUT, NULL);
break;
case HCISETLINKMODE:
@@ -2145,6 +2018,17 @@ static void hci_power_on(struct work_struct *work)
BT_DBG("%s", hdev->name);
+ if (test_bit(HCI_UP, &hdev->flags) &&
+ hci_dev_test_flag(hdev, HCI_MGMT) &&
+ hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
+ cancel_delayed_work(&hdev->power_off);
+ hci_req_sync_lock(hdev);
+ err = __hci_req_hci_power_on(hdev);
+ hci_req_sync_unlock(hdev);
+ mgmt_power_on(hdev, err);
+ return;
+ }
+
err = hci_dev_do_open(hdev);
if (err < 0) {
hci_dev_lock(hdev);
@@ -2227,28 +2111,6 @@ static void hci_error_reset(struct work_struct *work)
hci_dev_do_open(hdev);
}
-static void hci_discov_off(struct work_struct *work)
-{
- struct hci_dev *hdev;
-
- hdev = container_of(work, struct hci_dev, discov_off.work);
-
- BT_DBG("%s", hdev->name);
-
- mgmt_discoverable_timeout(hdev);
-}
-
-static void hci_adv_timeout_expire(struct work_struct *work)
-{
- struct hci_dev *hdev;
-
- hdev = container_of(work, struct hci_dev, adv_instance_expire.work);
-
- BT_DBG("%s", hdev->name);
-
- mgmt_adv_timeout_expired(hdev);
-}
-
void hci_uuids_clear(struct hci_dev *hdev)
{
struct bt_uuid *uuid, *tmp;
@@ -2726,7 +2588,8 @@ struct adv_info *hci_find_adv_instance(struct hci_dev *hdev, u8 instance)
}
/* This function requires the caller holds hdev->lock */
-struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance) {
+struct adv_info *hci_get_next_instance(struct hci_dev *hdev, u8 instance)
+{
struct adv_info *cur_instance;
cur_instance = hci_find_adv_instance(hdev, instance);
@@ -2752,9 +2615,12 @@ int hci_remove_adv_instance(struct hci_dev *hdev, u8 instance)
BT_DBG("%s removing %dMR", hdev->name, instance);
- if (hdev->cur_adv_instance == instance && hdev->adv_instance_timeout) {
- cancel_delayed_work(&hdev->adv_instance_expire);
- hdev->adv_instance_timeout = 0;
+ if (hdev->cur_adv_instance == instance) {
+ if (hdev->adv_instance_timeout) {
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+ hdev->cur_adv_instance = 0x00;
}
list_del(&adv_instance->list);
@@ -2781,6 +2647,7 @@ void hci_adv_instances_clear(struct hci_dev *hdev)
}
hdev->adv_instance_cnt = 0;
+ hdev->cur_adv_instance = 0x00;
}
/* This function requires the caller holds hdev->lock */
@@ -2851,12 +2718,10 @@ struct bdaddr_list *hci_bdaddr_list_lookup(struct list_head *bdaddr_list,
void hci_bdaddr_list_clear(struct list_head *bdaddr_list)
{
- struct list_head *p, *n;
-
- list_for_each_safe(p, n, bdaddr_list) {
- struct bdaddr_list *b = list_entry(p, struct bdaddr_list, list);
+ struct bdaddr_list *b, *n;
- list_del(p);
+ list_for_each_entry_safe(b, n, bdaddr_list, list) {
+ list_del(&b->list);
kfree(b);
}
}
@@ -3019,181 +2884,16 @@ void hci_conn_params_clear_disabled(struct hci_dev *hdev)
}
/* This function requires the caller holds hdev->lock */
-void hci_conn_params_clear_all(struct hci_dev *hdev)
+static void hci_conn_params_clear_all(struct hci_dev *hdev)
{
struct hci_conn_params *params, *tmp;
list_for_each_entry_safe(params, tmp, &hdev->le_conn_params, list)
hci_conn_params_free(params);
- hci_update_background_scan(hdev);
-
BT_DBG("All LE connection parameters were removed");
}
-static void inquiry_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
- if (status) {
- BT_ERR("Failed to start inquiry: status %d", status);
-
- hci_dev_lock(hdev);
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
- hci_dev_unlock(hdev);
- return;
- }
-}
-
-static void le_scan_disable_work_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
-{
- /* General inquiry access code (GIAC) */
- u8 lap[3] = { 0x33, 0x8b, 0x9e };
- struct hci_cp_inquiry cp;
- int err;
-
- if (status) {
- BT_ERR("Failed to disable LE scanning: status %d", status);
- return;
- }
-
- hdev->discovery.scan_start = 0;
-
- switch (hdev->discovery.type) {
- case DISCOV_TYPE_LE:
- hci_dev_lock(hdev);
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
- hci_dev_unlock(hdev);
- break;
-
- case DISCOV_TYPE_INTERLEAVED:
- hci_dev_lock(hdev);
-
- if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY,
- &hdev->quirks)) {
- /* If we were running LE only scan, change discovery
- * state. If we were running both LE and BR/EDR inquiry
- * simultaneously, and BR/EDR inquiry is already
- * finished, stop discovery, otherwise BR/EDR inquiry
- * will stop discovery when finished. If we will resolve
- * remote device name, do not change discovery state.
- */
- if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
- hdev->discovery.state != DISCOVERY_RESOLVING)
- hci_discovery_set_state(hdev,
- DISCOVERY_STOPPED);
- } else {
- struct hci_request req;
-
- hci_inquiry_cache_flush(hdev);
-
- hci_req_init(&req, hdev);
-
- memset(&cp, 0, sizeof(cp));
- memcpy(&cp.lap, lap, sizeof(cp.lap));
- cp.length = DISCOV_INTERLEAVED_INQUIRY_LEN;
- hci_req_add(&req, HCI_OP_INQUIRY, sizeof(cp), &cp);
-
- err = hci_req_run(&req, inquiry_complete);
- if (err) {
- BT_ERR("Inquiry request failed: err %d", err);
- hci_discovery_set_state(hdev,
- DISCOVERY_STOPPED);
- }
- }
-
- hci_dev_unlock(hdev);
- break;
- }
-}
-
-static void le_scan_disable_work(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- le_scan_disable.work);
- struct hci_request req;
- int err;
-
- BT_DBG("%s", hdev->name);
-
- cancel_delayed_work_sync(&hdev->le_scan_restart);
-
- hci_req_init(&req, hdev);
-
- hci_req_add_le_scan_disable(&req);
-
- err = hci_req_run(&req, le_scan_disable_work_complete);
- if (err)
- BT_ERR("Disable LE scanning request failed: err %d", err);
-}
-
-static void le_scan_restart_work_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
-{
- unsigned long timeout, duration, scan_start, now;
-
- BT_DBG("%s", hdev->name);
-
- if (status) {
- BT_ERR("Failed to restart LE scan: status %d", status);
- return;
- }
-
- if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) ||
- !hdev->discovery.scan_start)
- return;
-
- /* When the scan was started, hdev->le_scan_disable has been queued
- * after duration from scan_start. During scan restart this job
- * has been canceled, and we need to queue it again after proper
- * timeout, to make sure that scan does not run indefinitely.
- */
- duration = hdev->discovery.scan_duration;
- scan_start = hdev->discovery.scan_start;
- now = jiffies;
- if (now - scan_start <= duration) {
- int elapsed;
-
- if (now >= scan_start)
- elapsed = now - scan_start;
- else
- elapsed = ULONG_MAX - scan_start + now;
-
- timeout = duration - elapsed;
- } else {
- timeout = 0;
- }
- queue_delayed_work(hdev->workqueue,
- &hdev->le_scan_disable, timeout);
-}
-
-static void le_scan_restart_work(struct work_struct *work)
-{
- struct hci_dev *hdev = container_of(work, struct hci_dev,
- le_scan_restart.work);
- struct hci_request req;
- struct hci_cp_le_set_scan_enable cp;
- int err;
-
- BT_DBG("%s", hdev->name);
-
- /* If controller is not scanning we are done. */
- if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
- return;
-
- hci_req_init(&req, hdev);
-
- hci_req_add_le_scan_disable(&req);
-
- memset(&cp, 0, sizeof(cp));
- cp.enable = LE_SCAN_ENABLE;
- cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
- hci_req_add(&req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
-
- err = hci_req_run(&req, le_scan_restart_work_complete);
- if (err)
- BT_ERR("Restart LE scan request failed: err %d", err);
-}
-
/* Copy the Identity Address of the controller.
*
* If the controller has a public BD_ADDR, then by default use that one.
@@ -3292,10 +2992,6 @@ struct hci_dev *hci_alloc_dev(void)
INIT_WORK(&hdev->error_reset, hci_error_reset);
INIT_DELAYED_WORK(&hdev->power_off, hci_power_off);
- INIT_DELAYED_WORK(&hdev->discov_off, hci_discov_off);
- INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
- INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
- INIT_DELAYED_WORK(&hdev->adv_instance_expire, hci_adv_timeout_expire);
skb_queue_head_init(&hdev->rx_q);
skb_queue_head_init(&hdev->cmd_q);
@@ -3305,6 +3001,8 @@ struct hci_dev *hci_alloc_dev(void)
INIT_DELAYED_WORK(&hdev->cmd_timer, hci_cmd_timeout);
+ hci_request_setup(hdev);
+
hci_init_sysfs(hdev);
discovery_init(hdev);
@@ -3374,6 +3072,8 @@ int hci_register_dev(struct hci_dev *hdev)
if (error < 0)
goto err_wqueue;
+ hci_leds_init(hdev);
+
hdev->rfkill = rfkill_alloc(hdev->name, &hdev->dev,
RFKILL_TYPE_BLUETOOTH, &hci_rfkill_ops,
hdev);
@@ -3515,7 +3215,7 @@ int hci_reset_dev(struct hci_dev *hdev)
if (!skb)
return -ENOMEM;
- bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
+ hci_skb_pkt_type(skb) = HCI_EVENT_PKT;
memcpy(skb_put(skb, 3), hw_err, 3);
/* Send Hardware Error to upper stack */
@@ -3532,9 +3232,9 @@ int hci_recv_frame(struct hci_dev *hdev, struct sk_buff *skb)
return -ENXIO;
}
- if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT &&
- bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
- bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) {
+ if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
kfree_skb(skb);
return -EINVAL;
}
@@ -3556,7 +3256,7 @@ EXPORT_SYMBOL(hci_recv_frame);
int hci_recv_diag(struct hci_dev *hdev, struct sk_buff *skb)
{
/* Mark as diagnostic packet */
- bt_cb(skb)->pkt_type = HCI_DIAG_PKT;
+ hci_skb_pkt_type(skb) = HCI_DIAG_PKT;
/* Time stamp */
__net_timestamp(skb);
@@ -3598,7 +3298,8 @@ static void hci_send_frame(struct hci_dev *hdev, struct sk_buff *skb)
{
int err;
- BT_DBG("%s type %d len %d", hdev->name, bt_cb(skb)->pkt_type, skb->len);
+ BT_DBG("%s type %d len %d", hdev->name, hci_skb_pkt_type(skb),
+ skb->len);
/* Time stamp */
__net_timestamp(skb);
@@ -3643,7 +3344,7 @@ int hci_send_cmd(struct hci_dev *hdev, __u16 opcode, __u32 plen,
/* Stand-alone HCI commands must be flagged as
* single-command requests.
*/
- bt_cb(skb)->hci.req_start = true;
+ bt_cb(skb)->hci.req_flags |= HCI_REQ_START;
skb_queue_tail(&hdev->cmd_q, skb);
queue_work(hdev->workqueue, &hdev->cmd_work);
@@ -3680,9 +3381,9 @@ struct sk_buff *hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
bt_dev_dbg(hdev, "opcode 0x%4.4x plen %d", opcode, plen);
- hci_req_lock(hdev);
+ hci_req_sync_lock(hdev);
skb = __hci_cmd_sync(hdev, opcode, plen, param, timeout);
- hci_req_unlock(hdev);
+ hci_req_sync_unlock(hdev);
return skb;
}
@@ -3711,7 +3412,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
skb->len = skb_headlen(skb);
skb->data_len = 0;
- bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
+ hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT;
switch (hdev->dev_type) {
case HCI_BREDR:
@@ -3751,7 +3452,7 @@ static void hci_queue_acl(struct hci_chan *chan, struct sk_buff_head *queue,
do {
skb = list; list = list->next;
- bt_cb(skb)->pkt_type = HCI_ACLDATA_PKT;
+ hci_skb_pkt_type(skb) = HCI_ACLDATA_PKT;
hci_add_acl_hdr(skb, conn->handle, flags);
BT_DBG("%s frag %p len %d", hdev->name, skb, skb->len);
@@ -3789,7 +3490,7 @@ void hci_send_sco(struct hci_conn *conn, struct sk_buff *skb)
skb_reset_transport_header(skb);
memcpy(skb_transport_header(skb), &hdr, HCI_SCO_HDR_SIZE);
- bt_cb(skb)->pkt_type = HCI_SCODATA_PKT;
+ hci_skb_pkt_type(skb) = HCI_SCODATA_PKT;
skb_queue_tail(&conn->data_q, skb);
queue_work(hdev->workqueue, &hdev->tx_work);
@@ -4340,7 +4041,7 @@ static bool hci_req_is_complete(struct hci_dev *hdev)
if (!skb)
return true;
- return bt_cb(skb)->hci.req_start;
+ return (bt_cb(skb)->hci.req_flags & HCI_REQ_START);
}
static void hci_resend_last(struct hci_dev *hdev)
@@ -4400,26 +4101,28 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
* callback would be found in hdev->sent_cmd instead of the
* command queue (hdev->cmd_q).
*/
- if (bt_cb(hdev->sent_cmd)->hci.req_complete) {
- *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete;
+ if (bt_cb(hdev->sent_cmd)->hci.req_flags & HCI_REQ_SKB) {
+ *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb;
return;
}
- if (bt_cb(hdev->sent_cmd)->hci.req_complete_skb) {
- *req_complete_skb = bt_cb(hdev->sent_cmd)->hci.req_complete_skb;
+ if (bt_cb(hdev->sent_cmd)->hci.req_complete) {
+ *req_complete = bt_cb(hdev->sent_cmd)->hci.req_complete;
return;
}
/* Remove all pending commands belonging to this request */
spin_lock_irqsave(&hdev->cmd_q.lock, flags);
while ((skb = __skb_dequeue(&hdev->cmd_q))) {
- if (bt_cb(skb)->hci.req_start) {
+ if (bt_cb(skb)->hci.req_flags & HCI_REQ_START) {
__skb_queue_head(&hdev->cmd_q, skb);
break;
}
- *req_complete = bt_cb(skb)->hci.req_complete;
- *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
+ if (bt_cb(skb)->hci.req_flags & HCI_REQ_SKB)
+ *req_complete_skb = bt_cb(skb)->hci.req_complete_skb;
+ else
+ *req_complete = bt_cb(skb)->hci.req_complete;
kfree_skb(skb);
}
spin_unlock_irqrestore(&hdev->cmd_q.lock, flags);
@@ -4448,7 +4151,7 @@ static void hci_rx_work(struct work_struct *work)
if (test_bit(HCI_INIT, &hdev->flags)) {
/* Don't process data packets in this states. */
- switch (bt_cb(skb)->pkt_type) {
+ switch (hci_skb_pkt_type(skb)) {
case HCI_ACLDATA_PKT:
case HCI_SCODATA_PKT:
kfree_skb(skb);
@@ -4457,7 +4160,7 @@ static void hci_rx_work(struct work_struct *work)
}
/* Process frame */
- switch (bt_cb(skb)->pkt_type) {
+ switch (hci_skb_pkt_type(skb)) {
case HCI_EVENT_PKT:
BT_DBG("%s Event packet", hdev->name);
hci_event_packet(hdev, skb);
diff --git a/net/bluetooth/hci_event.c b/net/bluetooth/hci_event.c
index d57c11c1c6b5..c162af5d16bf 100644
--- a/net/bluetooth/hci_event.c
+++ b/net/bluetooth/hci_event.c
@@ -1183,7 +1183,7 @@ static void hci_cc_le_set_scan_enable(struct hci_dev *hdev,
hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
else if (!hci_dev_test_flag(hdev, HCI_LE_ADV) &&
hdev->discovery.state == DISCOVERY_FINDING)
- mgmt_reenable_advertising(hdev);
+ hci_req_reenable_advertising(hdev);
break;
@@ -2176,7 +2176,7 @@ static void hci_conn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
hci_send_cmd(hdev, HCI_OP_READ_REMOTE_FEATURES,
sizeof(cp), &cp);
- hci_update_page_scan(hdev);
+ hci_req_update_scan(hdev);
}
/* Set packet type for incoming connection */
@@ -2362,7 +2362,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
if (test_bit(HCI_CONN_FLUSH_KEY, &conn->flags))
hci_remove_link_key(hdev, &conn->dst);
- hci_update_page_scan(hdev);
+ hci_req_update_scan(hdev);
}
params = hci_conn_params_lookup(hdev, &conn->dst, conn->dst_type);
@@ -2401,7 +2401,7 @@ static void hci_disconn_complete_evt(struct hci_dev *hdev, struct sk_buff *skb)
* is timed out due to Directed Advertising."
*/
if (type == LE_LINK)
- mgmt_reenable_advertising(hdev);
+ hci_req_reenable_advertising(hdev);
unlock:
hci_dev_unlock(hdev);
@@ -3833,9 +3833,9 @@ static void hci_extended_inquiry_result_evt(struct hci_dev *hdev,
data.ssp_mode = 0x01;
if (hci_dev_test_flag(hdev, HCI_MGMT))
- name_known = eir_has_data_type(info->data,
- sizeof(info->data),
- EIR_NAME_COMPLETE);
+ name_known = eir_get_data(info->data,
+ sizeof(info->data),
+ EIR_NAME_COMPLETE, NULL);
else
name_known = true;
diff --git a/net/bluetooth/hci_request.c b/net/bluetooth/hci_request.c
index 981f8a202c27..6e125d76df0d 100644
--- a/net/bluetooth/hci_request.c
+++ b/net/bluetooth/hci_request.c
@@ -21,12 +21,19 @@
SOFTWARE IS DISCLAIMED.
*/
+#include <asm/unaligned.h>
+
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
+#include <net/bluetooth/mgmt.h>
#include "smp.h"
#include "hci_request.h"
+#define HCI_REQ_DONE 0
+#define HCI_REQ_PEND 1
+#define HCI_REQ_CANCELED 2
+
void hci_req_init(struct hci_request *req, struct hci_dev *hdev)
{
skb_queue_head_init(&req->cmd_q);
@@ -56,8 +63,12 @@ static int req_run(struct hci_request *req, hci_req_complete_t complete,
return -ENODATA;
skb = skb_peek_tail(&req->cmd_q);
- bt_cb(skb)->hci.req_complete = complete;
- bt_cb(skb)->hci.req_complete_skb = complete_skb;
+ if (complete) {
+ bt_cb(skb)->hci.req_complete = complete;
+ } else if (complete_skb) {
+ bt_cb(skb)->hci.req_complete_skb = complete_skb;
+ bt_cb(skb)->hci.req_flags |= HCI_REQ_SKB;
+ }
spin_lock_irqsave(&hdev->cmd_q.lock, flags);
skb_queue_splice_tail(&req->cmd_q, &hdev->cmd_q);
@@ -78,6 +89,203 @@ int hci_req_run_skb(struct hci_request *req, hci_req_complete_skb_t complete)
return req_run(req, NULL, complete);
}
+static void hci_req_sync_complete(struct hci_dev *hdev, u8 result, u16 opcode,
+ struct sk_buff *skb)
+{
+ BT_DBG("%s result 0x%2.2x", hdev->name, result);
+
+ if (hdev->req_status == HCI_REQ_PEND) {
+ hdev->req_result = result;
+ hdev->req_status = HCI_REQ_DONE;
+ if (skb)
+ hdev->req_skb = skb_get(skb);
+ wake_up_interruptible(&hdev->req_wait_q);
+ }
+}
+
+void hci_req_sync_cancel(struct hci_dev *hdev, int err)
+{
+ BT_DBG("%s err 0x%2.2x", hdev->name, err);
+
+ if (hdev->req_status == HCI_REQ_PEND) {
+ hdev->req_result = err;
+ hdev->req_status = HCI_REQ_CANCELED;
+ wake_up_interruptible(&hdev->req_wait_q);
+ }
+}
+
+struct sk_buff *__hci_cmd_sync_ev(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u8 event, u32 timeout)
+{
+ DECLARE_WAITQUEUE(wait, current);
+ struct hci_request req;
+ struct sk_buff *skb;
+ int err = 0;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_req_init(&req, hdev);
+
+ hci_req_add_ev(&req, opcode, plen, param, event);
+
+ hdev->req_status = HCI_REQ_PEND;
+
+ add_wait_queue(&hdev->req_wait_q, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = hci_req_run_skb(&req, hci_req_sync_complete);
+ if (err < 0) {
+ remove_wait_queue(&hdev->req_wait_q, &wait);
+ set_current_state(TASK_RUNNING);
+ return ERR_PTR(err);
+ }
+
+ schedule_timeout(timeout);
+
+ remove_wait_queue(&hdev->req_wait_q, &wait);
+
+ if (signal_pending(current))
+ return ERR_PTR(-EINTR);
+
+ switch (hdev->req_status) {
+ case HCI_REQ_DONE:
+ err = -bt_to_errno(hdev->req_result);
+ break;
+
+ case HCI_REQ_CANCELED:
+ err = -hdev->req_result;
+ break;
+
+ default:
+ err = -ETIMEDOUT;
+ break;
+ }
+
+ hdev->req_status = hdev->req_result = 0;
+ skb = hdev->req_skb;
+ hdev->req_skb = NULL;
+
+ BT_DBG("%s end: err %d", hdev->name, err);
+
+ if (err < 0) {
+ kfree_skb(skb);
+ return ERR_PTR(err);
+ }
+
+ if (!skb)
+ return ERR_PTR(-ENODATA);
+
+ return skb;
+}
+EXPORT_SYMBOL(__hci_cmd_sync_ev);
+
+struct sk_buff *__hci_cmd_sync(struct hci_dev *hdev, u16 opcode, u32 plen,
+ const void *param, u32 timeout)
+{
+ return __hci_cmd_sync_ev(hdev, opcode, plen, param, 0, timeout);
+}
+EXPORT_SYMBOL(__hci_cmd_sync);
+
+/* Execute request and wait for completion. */
+int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
+ unsigned long opt),
+ unsigned long opt, u32 timeout, u8 *hci_status)
+{
+ struct hci_request req;
+ DECLARE_WAITQUEUE(wait, current);
+ int err = 0;
+
+ BT_DBG("%s start", hdev->name);
+
+ hci_req_init(&req, hdev);
+
+ hdev->req_status = HCI_REQ_PEND;
+
+ err = func(&req, opt);
+ if (err) {
+ if (hci_status)
+ *hci_status = HCI_ERROR_UNSPECIFIED;
+ return err;
+ }
+
+ add_wait_queue(&hdev->req_wait_q, &wait);
+ set_current_state(TASK_INTERRUPTIBLE);
+
+ err = hci_req_run_skb(&req, hci_req_sync_complete);
+ if (err < 0) {
+ hdev->req_status = 0;
+
+ remove_wait_queue(&hdev->req_wait_q, &wait);
+ set_current_state(TASK_RUNNING);
+
+ /* ENODATA means the HCI request command queue is empty.
+ * This can happen when a request with conditionals doesn't
+ * trigger any commands to be sent. This is normal behavior
+ * and should not trigger an error return.
+ */
+ if (err == -ENODATA) {
+ if (hci_status)
+ *hci_status = 0;
+ return 0;
+ }
+
+ if (hci_status)
+ *hci_status = HCI_ERROR_UNSPECIFIED;
+
+ return err;
+ }
+
+ schedule_timeout(timeout);
+
+ remove_wait_queue(&hdev->req_wait_q, &wait);
+
+ if (signal_pending(current))
+ return -EINTR;
+
+ switch (hdev->req_status) {
+ case HCI_REQ_DONE:
+ err = -bt_to_errno(hdev->req_result);
+ if (hci_status)
+ *hci_status = hdev->req_result;
+ break;
+
+ case HCI_REQ_CANCELED:
+ err = -hdev->req_result;
+ if (hci_status)
+ *hci_status = HCI_ERROR_UNSPECIFIED;
+ break;
+
+ default:
+ err = -ETIMEDOUT;
+ if (hci_status)
+ *hci_status = HCI_ERROR_UNSPECIFIED;
+ break;
+ }
+
+ hdev->req_status = hdev->req_result = 0;
+
+ BT_DBG("%s end: err %d", hdev->name, err);
+
+ return err;
+}
+
+int hci_req_sync(struct hci_dev *hdev, int (*req)(struct hci_request *req,
+ unsigned long opt),
+ unsigned long opt, u32 timeout, u8 *hci_status)
+{
+ int ret;
+
+ if (!test_bit(HCI_UP, &hdev->flags))
+ return -ENETDOWN;
+
+ /* Serialize all requests */
+ hci_req_sync_lock(hdev);
+ ret = __hci_req_sync(hdev, req, opt, timeout, hci_status);
+ hci_req_sync_unlock(hdev);
+
+ return ret;
+}
+
struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
const void *param)
{
@@ -98,8 +306,8 @@ struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
BT_DBG("skb len %d", skb->len);
- bt_cb(skb)->pkt_type = HCI_COMMAND_PKT;
- bt_cb(skb)->hci.opcode = opcode;
+ hci_skb_pkt_type(skb) = HCI_COMMAND_PKT;
+ hci_skb_opcode(skb) = opcode;
return skb;
}
@@ -128,7 +336,7 @@ void hci_req_add_ev(struct hci_request *req, u16 opcode, u32 plen,
}
if (skb_queue_empty(&req->cmd_q))
- bt_cb(skb)->hci.req_start = true;
+ bt_cb(skb)->hci.req_flags |= HCI_REQ_START;
bt_cb(skb)->hci.req_event = event;
@@ -141,6 +349,311 @@ void hci_req_add(struct hci_request *req, u16 opcode, u32 plen,
hci_req_add_ev(req, opcode, plen, param, 0);
}
+void __hci_req_write_fast_connectable(struct hci_request *req, bool enable)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_page_scan_activity acp;
+ u8 type;
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ return;
+
+ if (hdev->hci_ver < BLUETOOTH_VER_1_2)
+ return;
+
+ if (enable) {
+ type = PAGE_SCAN_TYPE_INTERLACED;
+
+ /* 160 msec page scan interval */
+ acp.interval = cpu_to_le16(0x0100);
+ } else {
+ type = PAGE_SCAN_TYPE_STANDARD; /* default */
+
+ /* default 1.28 sec page scan */
+ acp.interval = cpu_to_le16(0x0800);
+ }
+
+ acp.window = cpu_to_le16(0x0012);
+
+ if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval ||
+ __cpu_to_le16(hdev->page_scan_window) != acp.window)
+ hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
+ sizeof(acp), &acp);
+
+ if (hdev->page_scan_type != type)
+ hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type);
+}
+
+/* This function controls the background scanning based on hdev->pend_le_conns
+ * list. If there are pending LE connection we start the background scanning,
+ * otherwise we stop it.
+ *
+ * This function requires the caller holds hdev->lock.
+ */
+static void __hci_update_background_scan(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+
+ if (!test_bit(HCI_UP, &hdev->flags) ||
+ test_bit(HCI_INIT, &hdev->flags) ||
+ hci_dev_test_flag(hdev, HCI_SETUP) ||
+ hci_dev_test_flag(hdev, HCI_CONFIG) ||
+ hci_dev_test_flag(hdev, HCI_AUTO_OFF) ||
+ hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ return;
+
+ /* No point in doing scanning if LE support hasn't been enabled */
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return;
+
+ /* If discovery is active don't interfere with it */
+ if (hdev->discovery.state != DISCOVERY_STOPPED)
+ return;
+
+ /* Reset RSSI and UUID filters when starting background scanning
+ * since these filters are meant for service discovery only.
+ *
+ * The Start Discovery and Start Service Discovery operations
+ * ensure to set proper values for RSSI threshold and UUID
+ * filter list. So it is safe to just reset them here.
+ */
+ hci_discovery_filter_clear(hdev);
+
+ if (list_empty(&hdev->pend_le_conns) &&
+ list_empty(&hdev->pend_le_reports)) {
+ /* If there is no pending LE connections or devices
+ * to be scanned for, we should stop the background
+ * scanning.
+ */
+
+ /* If controller is not scanning we are done. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ return;
+
+ hci_req_add_le_scan_disable(req);
+
+ BT_DBG("%s stopping background scanning", hdev->name);
+ } else {
+ /* If there is at least one pending LE connection, we should
+ * keep the background scan running.
+ */
+
+ /* If controller is connecting, we should not start scanning
+ * since some controllers are not able to scan and connect at
+ * the same time.
+ */
+ if (hci_lookup_le_connect(hdev))
+ return;
+
+ /* If controller is currently scanning, we stop it to ensure we
+ * don't miss any advertising (due to duplicates filter).
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ hci_req_add_le_scan_disable(req);
+
+ hci_req_add_le_passive_scan(req);
+
+ BT_DBG("%s starting background scanning", hdev->name);
+ }
+}
+
+void __hci_req_update_name(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_local_name cp;
+
+ memcpy(cp.name, hdev->dev_name, sizeof(cp.name));
+
+ hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp);
+}
+
+#define PNP_INFO_SVCLASS_ID 0x1200
+
+static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 4)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ u16 uuid16;
+
+ if (uuid->size != 16)
+ continue;
+
+ uuid16 = get_unaligned_le16(&uuid->uuid[12]);
+ if (uuid16 < 0x1100)
+ continue;
+
+ if (uuid16 == PNP_INFO_SVCLASS_ID)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID16_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + sizeof(u16) > len) {
+ uuids_start[1] = EIR_UUID16_SOME;
+ break;
+ }
+
+ *ptr++ = (uuid16 & 0x00ff);
+ *ptr++ = (uuid16 & 0xff00) >> 8;
+ uuids_start[0] += sizeof(uuid16);
+ }
+
+ return ptr;
+}
+
+static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 6)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ if (uuid->size != 32)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID32_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + sizeof(u32) > len) {
+ uuids_start[1] = EIR_UUID32_SOME;
+ break;
+ }
+
+ memcpy(ptr, &uuid->uuid[12], sizeof(u32));
+ ptr += sizeof(u32);
+ uuids_start[0] += sizeof(u32);
+ }
+
+ return ptr;
+}
+
+static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
+{
+ u8 *ptr = data, *uuids_start = NULL;
+ struct bt_uuid *uuid;
+
+ if (len < 18)
+ return ptr;
+
+ list_for_each_entry(uuid, &hdev->uuids, list) {
+ if (uuid->size != 128)
+ continue;
+
+ if (!uuids_start) {
+ uuids_start = ptr;
+ uuids_start[0] = 1;
+ uuids_start[1] = EIR_UUID128_ALL;
+ ptr += 2;
+ }
+
+ /* Stop if not enough space to put next UUID */
+ if ((ptr - data) + 16 > len) {
+ uuids_start[1] = EIR_UUID128_SOME;
+ break;
+ }
+
+ memcpy(ptr, uuid->uuid, 16);
+ ptr += 16;
+ uuids_start[0] += 16;
+ }
+
+ return ptr;
+}
+
+static void create_eir(struct hci_dev *hdev, u8 *data)
+{
+ u8 *ptr = data;
+ size_t name_len;
+
+ name_len = strlen(hdev->dev_name);
+
+ if (name_len > 0) {
+ /* EIR Data type */
+ if (name_len > 48) {
+ name_len = 48;
+ ptr[1] = EIR_NAME_SHORT;
+ } else
+ ptr[1] = EIR_NAME_COMPLETE;
+
+ /* EIR Data length */
+ ptr[0] = name_len + 1;
+
+ memcpy(ptr + 2, hdev->dev_name, name_len);
+
+ ptr += (name_len + 2);
+ }
+
+ if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) {
+ ptr[0] = 2;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8) hdev->inq_tx_power;
+
+ ptr += 3;
+ }
+
+ if (hdev->devid_source > 0) {
+ ptr[0] = 9;
+ ptr[1] = EIR_DEVICE_ID;
+
+ put_unaligned_le16(hdev->devid_source, ptr + 2);
+ put_unaligned_le16(hdev->devid_vendor, ptr + 4);
+ put_unaligned_le16(hdev->devid_product, ptr + 6);
+ put_unaligned_le16(hdev->devid_version, ptr + 8);
+
+ ptr += 10;
+ }
+
+ ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+ ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+ ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
+}
+
+void __hci_req_update_eir(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_eir cp;
+
+ if (!hdev_is_powered(hdev))
+ return;
+
+ if (!lmp_ext_inq_capable(hdev))
+ return;
+
+ if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+
+ create_eir(hdev, cp.data);
+
+ if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0)
+ return;
+
+ memcpy(hdev->eir, cp.data, sizeof(cp.data));
+
+ hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
+}
+
void hci_req_add_le_scan_disable(struct hci_request *req)
{
struct hci_cp_le_set_scan_enable cp;
@@ -175,21 +688,29 @@ static u8 update_white_list(struct hci_request *req)
* command to remove it from the controller.
*/
list_for_each_entry(b, &hdev->le_white_list, list) {
- struct hci_cp_le_del_from_white_list cp;
+ /* If the device is neither in pend_le_conns nor
+ * pend_le_reports then remove it from the whitelist.
+ */
+ if (!hci_pend_le_action_lookup(&hdev->pend_le_conns,
+ &b->bdaddr, b->bdaddr_type) &&
+ !hci_pend_le_action_lookup(&hdev->pend_le_reports,
+ &b->bdaddr, b->bdaddr_type)) {
+ struct hci_cp_le_del_from_white_list cp;
+
+ cp.bdaddr_type = b->bdaddr_type;
+ bacpy(&cp.bdaddr, &b->bdaddr);
- if (hci_pend_le_action_lookup(&hdev->pend_le_conns,
- &b->bdaddr, b->bdaddr_type) ||
- hci_pend_le_action_lookup(&hdev->pend_le_reports,
- &b->bdaddr, b->bdaddr_type)) {
- white_list_entries++;
+ hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
+ sizeof(cp), &cp);
continue;
}
- cp.bdaddr_type = b->bdaddr_type;
- bacpy(&cp.bdaddr, &b->bdaddr);
+ if (hci_find_irk_by_addr(hdev, &b->bdaddr, b->bdaddr_type)) {
+ /* White list can not be used with RPAs */
+ return 0x00;
+ }
- hci_req_add(req, HCI_OP_LE_DEL_FROM_WHITE_LIST,
- sizeof(cp), &cp);
+ white_list_entries++;
}
/* Since all no longer valid white list entries have been
@@ -250,6 +771,11 @@ static u8 update_white_list(struct hci_request *req)
return 0x01;
}
+static bool scan_use_rpa(struct hci_dev *hdev)
+{
+ return hci_dev_test_flag(hdev, HCI_PRIVACY);
+}
+
void hci_req_add_le_passive_scan(struct hci_request *req)
{
struct hci_cp_le_set_scan_param param_cp;
@@ -264,7 +790,8 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
* advertising with our address will be correctly reported
* by the controller.
*/
- if (hci_update_random_address(req, false, &own_addr_type))
+ if (hci_update_random_address(req, false, scan_use_rpa(hdev),
+ &own_addr_type))
return;
/* Adding or removing entries from the white list must
@@ -302,6 +829,513 @@ void hci_req_add_le_passive_scan(struct hci_request *req)
&enable_cp);
}
+static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
+{
+ u8 instance = hdev->cur_adv_instance;
+ struct adv_info *adv_instance;
+
+ /* Ignore instance 0 */
+ if (instance == 0x00)
+ return 0;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return 0;
+
+ /* TODO: Take into account the "appearance" and "local-name" flags here.
+ * These are currently being ignored as they are not supported.
+ */
+ return adv_instance->scan_rsp_len;
+}
+
+void __hci_req_disable_advertising(struct hci_request *req)
+{
+ u8 enable = 0x00;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+}
+
+static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
+{
+ u32 flags;
+ struct adv_info *adv_instance;
+
+ if (instance == 0x00) {
+ /* Instance 0 always manages the "Tx Power" and "Flags"
+ * fields
+ */
+ flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS;
+
+ /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting
+ * corresponds to the "connectable" instance flag.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
+ flags |= MGMT_ADV_FLAG_CONNECTABLE;
+
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+ flags |= MGMT_ADV_FLAG_LIMITED_DISCOV;
+ else if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ flags |= MGMT_ADV_FLAG_DISCOV;
+
+ return flags;
+ }
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+
+ /* Return 0 when we got an invalid instance identifier. */
+ if (!adv_instance)
+ return 0;
+
+ return adv_instance->flags;
+}
+
+static bool adv_use_rpa(struct hci_dev *hdev, uint32_t flags)
+{
+ /* If privacy is not enabled don't use RPA */
+ if (!hci_dev_test_flag(hdev, HCI_PRIVACY))
+ return false;
+
+ /* If basic privacy mode is enabled use RPA */
+ if (!hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+ return true;
+
+ /* If limited privacy mode is enabled don't use RPA if we're
+ * both discoverable and bondable.
+ */
+ if ((flags & MGMT_ADV_FLAG_DISCOV) &&
+ hci_dev_test_flag(hdev, HCI_BONDABLE))
+ return false;
+
+ /* We're neither bondable nor discoverable in the limited
+ * privacy mode, therefore use RPA.
+ */
+ return true;
+}
+
+void __hci_req_enable_advertising(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_adv_param cp;
+ u8 own_addr_type, enable = 0x01;
+ bool connectable;
+ u32 flags;
+
+ if (hci_conn_num(hdev, LE_LINK) > 0)
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV))
+ __hci_req_disable_advertising(req);
+
+ /* Clear the HCI_LE_ADV bit temporarily so that the
+ * hci_update_random_address knows that it's safe to go ahead
+ * and write a new random address. The flag will be set back on
+ * as soon as the SET_ADV_ENABLE HCI command completes.
+ */
+ hci_dev_clear_flag(hdev, HCI_LE_ADV);
+
+ flags = get_adv_instance_flags(hdev, hdev->cur_adv_instance);
+
+ /* If the "connectable" instance flag was not set, then choose between
+ * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
+ */
+ connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
+ mgmt_get_connectable(hdev);
+
+ /* Set require_privacy to true only when non-connectable
+ * advertising is used. In that case it is fine to use a
+ * non-resolvable private address.
+ */
+ if (hci_update_random_address(req, !connectable,
+ adv_use_rpa(hdev, flags),
+ &own_addr_type) < 0)
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval);
+ cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval);
+
+ if (connectable)
+ cp.type = LE_ADV_IND;
+ else if (get_cur_adv_instance_scan_rsp_len(hdev))
+ cp.type = LE_ADV_SCAN_IND;
+ else
+ cp.type = LE_ADV_NONCONN_IND;
+
+ cp.own_address_type = own_addr_type;
+ cp.channel_map = hdev->le_adv_channel_map;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+}
+
+static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
+{
+ u8 ad_len = 0;
+ size_t name_len;
+
+ name_len = strlen(hdev->dev_name);
+ if (name_len > 0) {
+ size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
+
+ if (name_len > max_len) {
+ name_len = max_len;
+ ptr[1] = EIR_NAME_SHORT;
+ } else
+ ptr[1] = EIR_NAME_COMPLETE;
+
+ ptr[0] = name_len + 1;
+
+ memcpy(ptr + 2, hdev->dev_name, name_len);
+
+ ad_len += (name_len + 2);
+ ptr += (name_len + 2);
+ }
+
+ return ad_len;
+}
+
+static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
+ u8 *ptr)
+{
+ struct adv_info *adv_instance;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return 0;
+
+ /* TODO: Set the appropriate entries based on advertising instance flags
+ * here once flags other than 0 are supported.
+ */
+ memcpy(ptr, adv_instance->scan_rsp_data,
+ adv_instance->scan_rsp_len);
+
+ return adv_instance->scan_rsp_len;
+}
+
+void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_scan_rsp_data cp;
+ u8 len;
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (instance)
+ len = create_instance_scan_rsp_data(hdev, instance, cp.data);
+ else
+ len = create_default_scan_rsp_data(hdev, cp.data);
+
+ if (hdev->scan_rsp_data_len == len &&
+ !memcmp(cp.data, hdev->scan_rsp_data, len))
+ return;
+
+ memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
+ hdev->scan_rsp_data_len = len;
+
+ cp.length = len;
+
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
+}
+
+static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
+{
+ struct adv_info *adv_instance = NULL;
+ u8 ad_len = 0, flags = 0;
+ u32 instance_flags;
+
+ /* Return 0 when the current instance identifier is invalid. */
+ if (instance) {
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return 0;
+ }
+
+ instance_flags = get_adv_instance_flags(hdev, instance);
+
+ /* The Add Advertising command allows userspace to set both the general
+ * and limited discoverable flags.
+ */
+ if (instance_flags & MGMT_ADV_FLAG_DISCOV)
+ flags |= LE_AD_GENERAL;
+
+ if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV)
+ flags |= LE_AD_LIMITED;
+
+ if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) {
+ /* If a discovery flag wasn't provided, simply use the global
+ * settings.
+ */
+ if (!flags)
+ flags |= mgmt_get_adv_discov_flags(hdev);
+
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ flags |= LE_AD_NO_BREDR;
+
+ /* If flags would still be empty, then there is no need to
+ * include the "Flags" AD field".
+ */
+ if (flags) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_FLAGS;
+ ptr[2] = flags;
+
+ ad_len += 3;
+ ptr += 3;
+ }
+ }
+
+ if (adv_instance) {
+ memcpy(ptr, adv_instance->adv_data,
+ adv_instance->adv_data_len);
+ ad_len += adv_instance->adv_data_len;
+ ptr += adv_instance->adv_data_len;
+ }
+
+ /* Provide Tx Power only if we can provide a valid value for it */
+ if (hdev->adv_tx_power != HCI_TX_POWER_INVALID &&
+ (instance_flags & MGMT_ADV_FLAG_TX_POWER)) {
+ ptr[0] = 0x02;
+ ptr[1] = EIR_TX_POWER;
+ ptr[2] = (u8)hdev->adv_tx_power;
+
+ ad_len += 3;
+ ptr += 3;
+ }
+
+ return ad_len;
+}
+
+void __hci_req_update_adv_data(struct hci_request *req, u8 instance)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_adv_data cp;
+ u8 len;
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ return;
+
+ memset(&cp, 0, sizeof(cp));
+
+ len = create_instance_adv_data(hdev, instance, cp.data);
+
+ /* There's nothing to do if the data hasn't changed */
+ if (hdev->adv_data_len == len &&
+ memcmp(cp.data, hdev->adv_data, len) == 0)
+ return;
+
+ memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
+ hdev->adv_data_len = len;
+
+ cp.length = len;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
+}
+
+int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance)
+{
+ struct hci_request req;
+
+ hci_req_init(&req, hdev);
+ __hci_req_update_adv_data(&req, instance);
+
+ return hci_req_run(&req, NULL);
+}
+
+static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+{
+ BT_DBG("%s status %u", hdev->name, status);
+}
+
+void hci_req_reenable_advertising(struct hci_dev *hdev)
+{
+ struct hci_request req;
+
+ if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ list_empty(&hdev->adv_instances))
+ return;
+
+ hci_req_init(&req, hdev);
+
+ if (hdev->cur_adv_instance) {
+ __hci_req_schedule_adv_instance(&req, hdev->cur_adv_instance,
+ true);
+ } else {
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ __hci_req_enable_advertising(&req);
+ }
+
+ hci_req_run(&req, adv_enable_complete);
+}
+
+static void adv_timeout_expire(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ adv_instance_expire.work);
+
+ struct hci_request req;
+ u8 instance;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ hdev->adv_instance_timeout = 0;
+
+ instance = hdev->cur_adv_instance;
+ if (instance == 0x00)
+ goto unlock;
+
+ hci_req_init(&req, hdev);
+
+ hci_req_clear_adv_instance(hdev, &req, instance, false);
+
+ if (list_empty(&hdev->adv_instances))
+ __hci_req_disable_advertising(&req);
+
+ hci_req_run(&req, NULL);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
+ bool force)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct adv_info *adv_instance = NULL;
+ u16 timeout;
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ list_empty(&hdev->adv_instances))
+ return -EPERM;
+
+ if (hdev->adv_instance_timeout)
+ return -EBUSY;
+
+ adv_instance = hci_find_adv_instance(hdev, instance);
+ if (!adv_instance)
+ return -ENOENT;
+
+ /* A zero timeout means unlimited advertising. As long as there is
+ * only one instance, duration should be ignored. We still set a timeout
+ * in case further instances are being added later on.
+ *
+ * If the remaining lifetime of the instance is more than the duration
+ * then the timeout corresponds to the duration, otherwise it will be
+ * reduced to the remaining instance lifetime.
+ */
+ if (adv_instance->timeout == 0 ||
+ adv_instance->duration <= adv_instance->remaining_time)
+ timeout = adv_instance->duration;
+ else
+ timeout = adv_instance->remaining_time;
+
+ /* The remaining time is being reduced unless the instance is being
+ * advertised without time limit.
+ */
+ if (adv_instance->timeout)
+ adv_instance->remaining_time =
+ adv_instance->remaining_time - timeout;
+
+ hdev->adv_instance_timeout = timeout;
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->adv_instance_expire,
+ msecs_to_jiffies(timeout * 1000));
+
+ /* If we're just re-scheduling the same instance again then do not
+ * execute any HCI commands. This happens when a single instance is
+ * being advertised.
+ */
+ if (!force && hdev->cur_adv_instance == instance &&
+ hci_dev_test_flag(hdev, HCI_LE_ADV))
+ return 0;
+
+ hdev->cur_adv_instance = instance;
+ __hci_req_update_adv_data(req, instance);
+ __hci_req_update_scan_rsp_data(req, instance);
+ __hci_req_enable_advertising(req);
+
+ return 0;
+}
+
+static void cancel_adv_timeout(struct hci_dev *hdev)
+{
+ if (hdev->adv_instance_timeout) {
+ hdev->adv_instance_timeout = 0;
+ cancel_delayed_work(&hdev->adv_instance_expire);
+ }
+}
+
+/* For a single instance:
+ * - force == true: The instance will be removed even when its remaining
+ * lifetime is not zero.
+ * - force == false: the instance will be deactivated but kept stored unless
+ * the remaining lifetime is zero.
+ *
+ * For instance == 0x00:
+ * - force == true: All instances will be removed regardless of their timeout
+ * setting.
+ * - force == false: Only instances that have a timeout will be removed.
+ */
+void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
+ u8 instance, bool force)
+{
+ struct adv_info *adv_instance, *n, *next_instance = NULL;
+ int err;
+ u8 rem_inst;
+
+ /* Cancel any timeout concerning the removed instance(s). */
+ if (!instance || hdev->cur_adv_instance == instance)
+ cancel_adv_timeout(hdev);
+
+ /* Get the next instance to advertise BEFORE we remove
+ * the current one. This can be the same instance again
+ * if there is only one instance.
+ */
+ if (instance && hdev->cur_adv_instance == instance)
+ next_instance = hci_get_next_instance(hdev, instance);
+
+ if (instance == 0x00) {
+ list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances,
+ list) {
+ if (!(force || adv_instance->timeout))
+ continue;
+
+ rem_inst = adv_instance->instance;
+ err = hci_remove_adv_instance(hdev, rem_inst);
+ if (!err)
+ mgmt_advertising_removed(NULL, hdev, rem_inst);
+ }
+ } else {
+ adv_instance = hci_find_adv_instance(hdev, instance);
+
+ if (force || (adv_instance && adv_instance->timeout &&
+ !adv_instance->remaining_time)) {
+ /* Don't advertise a removed instance. */
+ if (next_instance &&
+ next_instance->instance == instance)
+ next_instance = NULL;
+
+ err = hci_remove_adv_instance(hdev, instance);
+ if (!err)
+ mgmt_advertising_removed(NULL, hdev, instance);
+ }
+ }
+
+ if (!req || !hdev_is_powered(hdev) ||
+ hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ return;
+
+ if (next_instance)
+ __hci_req_schedule_adv_instance(req, next_instance->instance,
+ false);
+}
+
static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
{
struct hci_dev *hdev = req->hdev;
@@ -327,7 +1361,7 @@ static void set_random_addr(struct hci_request *req, bdaddr_t *rpa)
}
int hci_update_random_address(struct hci_request *req, bool require_privacy,
- u8 *own_addr_type)
+ bool use_rpa, u8 *own_addr_type)
{
struct hci_dev *hdev = req->hdev;
int err;
@@ -336,7 +1370,7 @@ int hci_update_random_address(struct hci_request *req, bool require_privacy,
* current RPA has expired or there is something else than
* the current RPA in use, then generate a new one.
*/
- if (hci_dev_test_flag(hdev, HCI_PRIVACY)) {
+ if (use_rpa) {
int to;
*own_addr_type = ADDR_LE_DEV_RANDOM;
@@ -432,7 +1466,7 @@ static bool disconnected_whitelist_entries(struct hci_dev *hdev)
return false;
}
-void __hci_update_page_scan(struct hci_request *req)
+void __hci_req_update_scan(struct hci_request *req)
{
struct hci_dev *hdev = req->hdev;
u8 scan;
@@ -452,117 +1486,175 @@ void __hci_update_page_scan(struct hci_request *req)
else
scan = SCAN_DISABLED;
- if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE))
- return;
-
if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
scan |= SCAN_INQUIRY;
+ if (test_bit(HCI_PSCAN, &hdev->flags) == !!(scan & SCAN_PAGE) &&
+ test_bit(HCI_ISCAN, &hdev->flags) == !!(scan & SCAN_INQUIRY))
+ return;
+
hci_req_add(req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
}
-void hci_update_page_scan(struct hci_dev *hdev)
+static int update_scan(struct hci_request *req, unsigned long opt)
{
- struct hci_request req;
+ hci_dev_lock(req->hdev);
+ __hci_req_update_scan(req);
+ hci_dev_unlock(req->hdev);
+ return 0;
+}
- hci_req_init(&req, hdev);
- __hci_update_page_scan(&req);
- hci_req_run(&req, NULL);
+static void scan_update_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev, scan_update);
+
+ hci_req_sync(hdev, update_scan, 0, HCI_CMD_TIMEOUT, NULL);
}
-/* This function controls the background scanning based on hdev->pend_le_conns
- * list. If there are pending LE connection we start the background scanning,
- * otherwise we stop it.
- *
- * This function requires the caller holds hdev->lock.
- */
-void __hci_update_background_scan(struct hci_request *req)
+static int connectable_update(struct hci_request *req, unsigned long opt)
{
struct hci_dev *hdev = req->hdev;
- if (!test_bit(HCI_UP, &hdev->flags) ||
- test_bit(HCI_INIT, &hdev->flags) ||
- hci_dev_test_flag(hdev, HCI_SETUP) ||
- hci_dev_test_flag(hdev, HCI_CONFIG) ||
- hci_dev_test_flag(hdev, HCI_AUTO_OFF) ||
- hci_dev_test_flag(hdev, HCI_UNREGISTER))
+ hci_dev_lock(hdev);
+
+ __hci_req_update_scan(req);
+
+ /* If BR/EDR is not enabled and we disable advertising as a
+ * by-product of disabling connectable, we need to update the
+ * advertising flags.
+ */
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
+ __hci_req_update_adv_data(req, hdev->cur_adv_instance);
+
+ /* Update the advertising parameters if necessary */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ !list_empty(&hdev->adv_instances))
+ __hci_req_enable_advertising(req);
+
+ __hci_update_background_scan(req);
+
+ hci_dev_unlock(hdev);
+
+ return 0;
+}
+
+static void connectable_update_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ connectable_update);
+ u8 status;
+
+ hci_req_sync(hdev, connectable_update, 0, HCI_CMD_TIMEOUT, &status);
+ mgmt_set_connectable_complete(hdev, status);
+}
+
+static u8 get_service_classes(struct hci_dev *hdev)
+{
+ struct bt_uuid *uuid;
+ u8 val = 0;
+
+ list_for_each_entry(uuid, &hdev->uuids, list)
+ val |= uuid->svc_hint;
+
+ return val;
+}
+
+void __hci_req_update_class(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ u8 cod[3];
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hdev_is_powered(hdev))
return;
- /* No point in doing scanning if LE support hasn't been enabled */
- if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
+ if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
return;
- /* If discovery is active don't interfere with it */
- if (hdev->discovery.state != DISCOVERY_STOPPED)
+ if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
return;
- /* Reset RSSI and UUID filters when starting background scanning
- * since these filters are meant for service discovery only.
- *
- * The Start Discovery and Start Service Discovery operations
- * ensure to set proper values for RSSI threshold and UUID
- * filter list. So it is safe to just reset them here.
- */
- hci_discovery_filter_clear(hdev);
+ cod[0] = hdev->minor_class;
+ cod[1] = hdev->major_class;
+ cod[2] = get_service_classes(hdev);
- if (list_empty(&hdev->pend_le_conns) &&
- list_empty(&hdev->pend_le_reports)) {
- /* If there is no pending LE connections or devices
- * to be scanned for, we should stop the background
- * scanning.
- */
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
+ cod[1] |= 0x20;
- /* If controller is not scanning we are done. */
- if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
- return;
+ if (memcmp(cod, hdev->dev_class, 3) == 0)
+ return;
- hci_req_add_le_scan_disable(req);
+ hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod);
+}
- BT_DBG("%s stopping background scanning", hdev->name);
+static void write_iac(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_write_current_iac_lap cp;
+
+ if (!hci_dev_test_flag(hdev, HCI_DISCOVERABLE))
+ return;
+
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE)) {
+ /* Limited discoverable mode */
+ cp.num_iac = min_t(u8, hdev->num_iac, 2);
+ cp.iac_lap[0] = 0x00; /* LIAC */
+ cp.iac_lap[1] = 0x8b;
+ cp.iac_lap[2] = 0x9e;
+ cp.iac_lap[3] = 0x33; /* GIAC */
+ cp.iac_lap[4] = 0x8b;
+ cp.iac_lap[5] = 0x9e;
} else {
- /* If there is at least one pending LE connection, we should
- * keep the background scan running.
- */
+ /* General discoverable mode */
+ cp.num_iac = 1;
+ cp.iac_lap[0] = 0x33; /* GIAC */
+ cp.iac_lap[1] = 0x8b;
+ cp.iac_lap[2] = 0x9e;
+ }
- /* If controller is connecting, we should not start scanning
- * since some controllers are not able to scan and connect at
- * the same time.
- */
- if (hci_lookup_le_connect(hdev))
- return;
+ hci_req_add(req, HCI_OP_WRITE_CURRENT_IAC_LAP,
+ (cp.num_iac * 3) + 1, &cp);
+}
- /* If controller is currently scanning, we stop it to ensure we
- * don't miss any advertising (due to duplicates filter).
- */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
- hci_req_add_le_scan_disable(req);
+static int discoverable_update(struct hci_request *req, unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
- hci_req_add_le_passive_scan(req);
+ hci_dev_lock(hdev);
- BT_DBG("%s starting background scanning", hdev->name);
+ if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
+ write_iac(req);
+ __hci_req_update_scan(req);
+ __hci_req_update_class(req);
}
-}
-static void update_background_scan_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
-{
- if (status)
- BT_DBG("HCI request failed to update background scanning: "
- "status 0x%2.2x", status);
-}
+ /* Advertising instances don't use the global discoverable setting, so
+ * only update AD if advertising was enabled using Set Advertising.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING)) {
+ __hci_req_update_adv_data(req, 0x00);
-void hci_update_background_scan(struct hci_dev *hdev)
-{
- int err;
- struct hci_request req;
+ /* Discoverable mode affects the local advertising
+ * address in limited privacy mode.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+ __hci_req_enable_advertising(req);
+ }
- hci_req_init(&req, hdev);
+ hci_dev_unlock(hdev);
- __hci_update_background_scan(&req);
+ return 0;
+}
- err = hci_req_run(&req, update_background_scan_complete);
- if (err && err != -ENODATA)
- BT_ERR("Failed to run HCI request: err %d", err);
+static void discoverable_update_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ discoverable_update);
+ u8 status;
+
+ hci_req_sync(hdev, discoverable_update, 0, HCI_CMD_TIMEOUT, &status);
+ mgmt_set_discoverable_complete(hdev, status);
}
void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
@@ -657,3 +1749,575 @@ int hci_abort_conn(struct hci_conn *conn, u8 reason)
return 0;
}
+
+static int update_bg_scan(struct hci_request *req, unsigned long opt)
+{
+ hci_dev_lock(req->hdev);
+ __hci_update_background_scan(req);
+ hci_dev_unlock(req->hdev);
+ return 0;
+}
+
+static void bg_scan_update(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ bg_scan_update);
+ struct hci_conn *conn;
+ u8 status;
+ int err;
+
+ err = hci_req_sync(hdev, update_bg_scan, 0, HCI_CMD_TIMEOUT, &status);
+ if (!err)
+ return;
+
+ hci_dev_lock(hdev);
+
+ conn = hci_conn_hash_lookup_state(hdev, LE_LINK, BT_CONNECT);
+ if (conn)
+ hci_le_conn_failed(conn, status);
+
+ hci_dev_unlock(hdev);
+}
+
+static int le_scan_disable(struct hci_request *req, unsigned long opt)
+{
+ hci_req_add_le_scan_disable(req);
+ return 0;
+}
+
+static int bredr_inquiry(struct hci_request *req, unsigned long opt)
+{
+ u8 length = opt;
+ const u8 giac[3] = { 0x33, 0x8b, 0x9e };
+ const u8 liac[3] = { 0x00, 0x8b, 0x9e };
+ struct hci_cp_inquiry cp;
+
+ BT_DBG("%s", req->hdev->name);
+
+ hci_dev_lock(req->hdev);
+ hci_inquiry_cache_flush(req->hdev);
+ hci_dev_unlock(req->hdev);
+
+ memset(&cp, 0, sizeof(cp));
+
+ if (req->hdev->discovery.limited)
+ memcpy(&cp.lap, liac, sizeof(cp.lap));
+ else
+ memcpy(&cp.lap, giac, sizeof(cp.lap));
+
+ cp.length = length;
+
+ hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
+
+ return 0;
+}
+
+static void le_scan_disable_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ le_scan_disable.work);
+ u8 status;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ return;
+
+ cancel_delayed_work(&hdev->le_scan_restart);
+
+ hci_req_sync(hdev, le_scan_disable, 0, HCI_CMD_TIMEOUT, &status);
+ if (status) {
+ BT_ERR("Failed to disable LE scan: status 0x%02x", status);
+ return;
+ }
+
+ hdev->discovery.scan_start = 0;
+
+ /* If we were running LE only scan, change discovery state. If
+ * we were running both LE and BR/EDR inquiry simultaneously,
+ * and BR/EDR inquiry is already finished, stop discovery,
+ * otherwise BR/EDR inquiry will stop discovery when finished.
+ * If we will resolve remote device name, do not change
+ * discovery state.
+ */
+
+ if (hdev->discovery.type == DISCOV_TYPE_LE)
+ goto discov_stopped;
+
+ if (hdev->discovery.type != DISCOV_TYPE_INTERLEAVED)
+ return;
+
+ if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks)) {
+ if (!test_bit(HCI_INQUIRY, &hdev->flags) &&
+ hdev->discovery.state != DISCOVERY_RESOLVING)
+ goto discov_stopped;
+
+ return;
+ }
+
+ hci_req_sync(hdev, bredr_inquiry, DISCOV_INTERLEAVED_INQUIRY_LEN,
+ HCI_CMD_TIMEOUT, &status);
+ if (status) {
+ BT_ERR("Inquiry failed: status 0x%02x", status);
+ goto discov_stopped;
+ }
+
+ return;
+
+discov_stopped:
+ hci_dev_lock(hdev);
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ hci_dev_unlock(hdev);
+}
+
+static int le_scan_restart(struct hci_request *req, unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_scan_enable cp;
+
+ /* If controller is not scanning we are done. */
+ if (!hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ return 0;
+
+ hci_req_add_le_scan_disable(req);
+
+ memset(&cp, 0, sizeof(cp));
+ cp.enable = LE_SCAN_ENABLE;
+ cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(cp), &cp);
+
+ return 0;
+}
+
+static void le_scan_restart_work(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ le_scan_restart.work);
+ unsigned long timeout, duration, scan_start, now;
+ u8 status;
+
+ BT_DBG("%s", hdev->name);
+
+ hci_req_sync(hdev, le_scan_restart, 0, HCI_CMD_TIMEOUT, &status);
+ if (status) {
+ BT_ERR("Failed to restart LE scan: status %d", status);
+ return;
+ }
+
+ hci_dev_lock(hdev);
+
+ if (!test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) ||
+ !hdev->discovery.scan_start)
+ goto unlock;
+
+ /* When the scan was started, hdev->le_scan_disable has been queued
+ * after duration from scan_start. During scan restart this job
+ * has been canceled, and we need to queue it again after proper
+ * timeout, to make sure that scan does not run indefinitely.
+ */
+ duration = hdev->discovery.scan_duration;
+ scan_start = hdev->discovery.scan_start;
+ now = jiffies;
+ if (now - scan_start <= duration) {
+ int elapsed;
+
+ if (now >= scan_start)
+ elapsed = now - scan_start;
+ else
+ elapsed = ULONG_MAX - scan_start + now;
+
+ timeout = duration - elapsed;
+ } else {
+ timeout = 0;
+ }
+
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->le_scan_disable, timeout);
+
+unlock:
+ hci_dev_unlock(hdev);
+}
+
+static void disable_advertising(struct hci_request *req)
+{
+ u8 enable = 0x00;
+
+ hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
+}
+
+static int active_scan(struct hci_request *req, unsigned long opt)
+{
+ uint16_t interval = opt;
+ struct hci_dev *hdev = req->hdev;
+ struct hci_cp_le_set_scan_param param_cp;
+ struct hci_cp_le_set_scan_enable enable_cp;
+ u8 own_addr_type;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ADV)) {
+ hci_dev_lock(hdev);
+
+ /* Don't let discovery abort an outgoing connection attempt
+ * that's using directed advertising.
+ */
+ if (hci_lookup_le_connect(hdev)) {
+ hci_dev_unlock(hdev);
+ return -EBUSY;
+ }
+
+ cancel_adv_timeout(hdev);
+ hci_dev_unlock(hdev);
+
+ disable_advertising(req);
+ }
+
+ /* If controller is scanning, it means the background scanning is
+ * running. Thus, we should temporarily stop it in order to set the
+ * discovery scanning parameters.
+ */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
+ hci_req_add_le_scan_disable(req);
+
+ /* All active scans will be done with either a resolvable private
+ * address (when privacy feature has been enabled) or non-resolvable
+ * private address.
+ */
+ err = hci_update_random_address(req, true, scan_use_rpa(hdev),
+ &own_addr_type);
+ if (err < 0)
+ own_addr_type = ADDR_LE_DEV_PUBLIC;
+
+ memset(&param_cp, 0, sizeof(param_cp));
+ param_cp.type = LE_SCAN_ACTIVE;
+ param_cp.interval = cpu_to_le16(interval);
+ param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN);
+ param_cp.own_address_type = own_addr_type;
+
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
+ &param_cp);
+
+ memset(&enable_cp, 0, sizeof(enable_cp));
+ enable_cp.enable = LE_SCAN_ENABLE;
+ enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
+
+ hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
+ &enable_cp);
+
+ return 0;
+}
+
+static int interleaved_discov(struct hci_request *req, unsigned long opt)
+{
+ int err;
+
+ BT_DBG("%s", req->hdev->name);
+
+ err = active_scan(req, opt);
+ if (err)
+ return err;
+
+ return bredr_inquiry(req, DISCOV_BREDR_INQUIRY_LEN);
+}
+
+static void start_discovery(struct hci_dev *hdev, u8 *status)
+{
+ unsigned long timeout;
+
+ BT_DBG("%s type %u", hdev->name, hdev->discovery.type);
+
+ switch (hdev->discovery.type) {
+ case DISCOV_TYPE_BREDR:
+ if (!hci_dev_test_flag(hdev, HCI_INQUIRY))
+ hci_req_sync(hdev, bredr_inquiry,
+ DISCOV_BREDR_INQUIRY_LEN, HCI_CMD_TIMEOUT,
+ status);
+ return;
+ case DISCOV_TYPE_INTERLEAVED:
+ /* When running simultaneous discovery, the LE scanning time
+ * should occupy the whole discovery time sine BR/EDR inquiry
+ * and LE scanning are scheduled by the controller.
+ *
+ * For interleaving discovery in comparison, BR/EDR inquiry
+ * and LE scanning are done sequentially with separate
+ * timeouts.
+ */
+ if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY,
+ &hdev->quirks)) {
+ timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+ /* During simultaneous discovery, we double LE scan
+ * interval. We must leave some time for the controller
+ * to do BR/EDR inquiry.
+ */
+ hci_req_sync(hdev, interleaved_discov,
+ DISCOV_LE_SCAN_INT * 2, HCI_CMD_TIMEOUT,
+ status);
+ break;
+ }
+
+ timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout);
+ hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT,
+ HCI_CMD_TIMEOUT, status);
+ break;
+ case DISCOV_TYPE_LE:
+ timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+ hci_req_sync(hdev, active_scan, DISCOV_LE_SCAN_INT,
+ HCI_CMD_TIMEOUT, status);
+ break;
+ default:
+ *status = HCI_ERROR_UNSPECIFIED;
+ return;
+ }
+
+ if (*status)
+ return;
+
+ BT_DBG("%s timeout %u ms", hdev->name, jiffies_to_msecs(timeout));
+
+ /* When service discovery is used and the controller has a
+ * strict duplicate filter, it is important to remember the
+ * start and duration of the scan. This is required for
+ * restarting scanning during the discovery phase.
+ */
+ if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER, &hdev->quirks) &&
+ hdev->discovery.result_filtering) {
+ hdev->discovery.scan_start = jiffies;
+ hdev->discovery.scan_duration = timeout;
+ }
+
+ queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_disable,
+ timeout);
+}
+
+bool hci_req_stop_discovery(struct hci_request *req)
+{
+ struct hci_dev *hdev = req->hdev;
+ struct discovery_state *d = &hdev->discovery;
+ struct hci_cp_remote_name_req_cancel cp;
+ struct inquiry_entry *e;
+ bool ret = false;
+
+ BT_DBG("%s state %u", hdev->name, hdev->discovery.state);
+
+ if (d->state == DISCOVERY_FINDING || d->state == DISCOVERY_STOPPING) {
+ if (test_bit(HCI_INQUIRY, &hdev->flags))
+ hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL);
+
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ cancel_delayed_work(&hdev->le_scan_disable);
+ hci_req_add_le_scan_disable(req);
+ }
+
+ ret = true;
+ } else {
+ /* Passive scanning */
+ if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
+ hci_req_add_le_scan_disable(req);
+ ret = true;
+ }
+ }
+
+ /* No further actions needed for LE-only discovery */
+ if (d->type == DISCOV_TYPE_LE)
+ return ret;
+
+ if (d->state == DISCOVERY_RESOLVING || d->state == DISCOVERY_STOPPING) {
+ e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY,
+ NAME_PENDING);
+ if (!e)
+ return ret;
+
+ bacpy(&cp.bdaddr, &e->data.bdaddr);
+ hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp),
+ &cp);
+ ret = true;
+ }
+
+ return ret;
+}
+
+static int stop_discovery(struct hci_request *req, unsigned long opt)
+{
+ hci_dev_lock(req->hdev);
+ hci_req_stop_discovery(req);
+ hci_dev_unlock(req->hdev);
+
+ return 0;
+}
+
+static void discov_update(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ discov_update);
+ u8 status = 0;
+
+ switch (hdev->discovery.state) {
+ case DISCOVERY_STARTING:
+ start_discovery(hdev, &status);
+ mgmt_start_discovery_complete(hdev, status);
+ if (status)
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ else
+ hci_discovery_set_state(hdev, DISCOVERY_FINDING);
+ break;
+ case DISCOVERY_STOPPING:
+ hci_req_sync(hdev, stop_discovery, 0, HCI_CMD_TIMEOUT, &status);
+ mgmt_stop_discovery_complete(hdev, status);
+ if (!status)
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
+ break;
+ case DISCOVERY_STOPPED:
+ default:
+ return;
+ }
+}
+
+static void discov_off(struct work_struct *work)
+{
+ struct hci_dev *hdev = container_of(work, struct hci_dev,
+ discov_off.work);
+
+ BT_DBG("%s", hdev->name);
+
+ hci_dev_lock(hdev);
+
+ /* When discoverable timeout triggers, then just make sure
+ * the limited discoverable flag is cleared. Even in the case
+ * of a timeout triggered from general discoverable, it is
+ * safe to unconditionally clear the flag.
+ */
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hdev->discov_timeout = 0;
+
+ hci_dev_unlock(hdev);
+
+ hci_req_sync(hdev, discoverable_update, 0, HCI_CMD_TIMEOUT, NULL);
+ mgmt_new_settings(hdev);
+}
+
+static int powered_update_hci(struct hci_request *req, unsigned long opt)
+{
+ struct hci_dev *hdev = req->hdev;
+ u8 link_sec;
+
+ hci_dev_lock(hdev);
+
+ if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
+ !lmp_host_ssp_capable(hdev)) {
+ u8 mode = 0x01;
+
+ hci_req_add(req, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode);
+
+ if (bredr_sc_enabled(hdev) && !lmp_host_sc_capable(hdev)) {
+ u8 support = 0x01;
+
+ hci_req_add(req, HCI_OP_WRITE_SC_SUPPORT,
+ sizeof(support), &support);
+ }
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
+ lmp_bredr_capable(hdev)) {
+ struct hci_cp_write_le_host_supported cp;
+
+ cp.le = 0x01;
+ cp.simul = 0x00;
+
+ /* Check first if we already have the right
+ * host state (host features set)
+ */
+ if (cp.le != lmp_host_le_capable(hdev) ||
+ cp.simul != lmp_host_le_br_capable(hdev))
+ hci_req_add(req, HCI_OP_WRITE_LE_HOST_SUPPORTED,
+ sizeof(cp), &cp);
+ }
+
+ if (hci_dev_test_flag(hdev, HCI_LE_ENABLED)) {
+ /* Make sure the controller has a good default for
+ * advertising data. This also applies to the case
+ * where BR/EDR was toggled during the AUTO_OFF phase.
+ */
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
+ list_empty(&hdev->adv_instances)) {
+ __hci_req_update_adv_data(req, 0x00);
+ __hci_req_update_scan_rsp_data(req, 0x00);
+
+ if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
+ __hci_req_enable_advertising(req);
+ } else if (!list_empty(&hdev->adv_instances)) {
+ struct adv_info *adv_instance;
+
+ adv_instance = list_first_entry(&hdev->adv_instances,
+ struct adv_info, list);
+ __hci_req_schedule_adv_instance(req,
+ adv_instance->instance,
+ true);
+ }
+ }
+
+ link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY);
+ if (link_sec != test_bit(HCI_AUTH, &hdev->flags))
+ hci_req_add(req, HCI_OP_WRITE_AUTH_ENABLE,
+ sizeof(link_sec), &link_sec);
+
+ if (lmp_bredr_capable(hdev)) {
+ if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
+ __hci_req_write_fast_connectable(req, true);
+ else
+ __hci_req_write_fast_connectable(req, false);
+ __hci_req_update_scan(req);
+ __hci_req_update_class(req);
+ __hci_req_update_name(req);
+ __hci_req_update_eir(req);
+ }
+
+ hci_dev_unlock(hdev);
+ return 0;
+}
+
+int __hci_req_hci_power_on(struct hci_dev *hdev)
+{
+ /* Register the available SMP channels (BR/EDR and LE) only when
+ * successfully powering on the controller. This late
+ * registration is required so that LE SMP can clearly decide if
+ * the public address or static address is used.
+ */
+ smp_register(hdev);
+
+ return __hci_req_sync(hdev, powered_update_hci, 0, HCI_CMD_TIMEOUT,
+ NULL);
+}
+
+void hci_request_setup(struct hci_dev *hdev)
+{
+ INIT_WORK(&hdev->discov_update, discov_update);
+ INIT_WORK(&hdev->bg_scan_update, bg_scan_update);
+ INIT_WORK(&hdev->scan_update, scan_update_work);
+ INIT_WORK(&hdev->connectable_update, connectable_update_work);
+ INIT_WORK(&hdev->discoverable_update, discoverable_update_work);
+ INIT_DELAYED_WORK(&hdev->discov_off, discov_off);
+ INIT_DELAYED_WORK(&hdev->le_scan_disable, le_scan_disable_work);
+ INIT_DELAYED_WORK(&hdev->le_scan_restart, le_scan_restart_work);
+ INIT_DELAYED_WORK(&hdev->adv_instance_expire, adv_timeout_expire);
+}
+
+void hci_request_cancel_all(struct hci_dev *hdev)
+{
+ hci_req_sync_cancel(hdev, ENODEV);
+
+ cancel_work_sync(&hdev->discov_update);
+ cancel_work_sync(&hdev->bg_scan_update);
+ cancel_work_sync(&hdev->scan_update);
+ cancel_work_sync(&hdev->connectable_update);
+ cancel_work_sync(&hdev->discoverable_update);
+ cancel_delayed_work_sync(&hdev->discov_off);
+ cancel_delayed_work_sync(&hdev->le_scan_disable);
+ cancel_delayed_work_sync(&hdev->le_scan_restart);
+
+ if (hdev->adv_instance_timeout) {
+ cancel_delayed_work_sync(&hdev->adv_instance_expire);
+ hdev->adv_instance_timeout = 0;
+ }
+}
diff --git a/net/bluetooth/hci_request.h b/net/bluetooth/hci_request.h
index 25c7f1305dcb..b2d044bdc732 100644
--- a/net/bluetooth/hci_request.h
+++ b/net/bluetooth/hci_request.h
@@ -20,6 +20,9 @@
SOFTWARE IS DISCLAIMED.
*/
+#define hci_req_sync_lock(hdev) mutex_lock(&hdev->req_lock)
+#define hci_req_sync_unlock(hdev) mutex_unlock(&hdev->req_lock)
+
struct hci_request {
struct hci_dev *hdev;
struct sk_buff_head cmd_q;
@@ -41,21 +44,61 @@ void hci_req_cmd_complete(struct hci_dev *hdev, u16 opcode, u8 status,
hci_req_complete_t *req_complete,
hci_req_complete_skb_t *req_complete_skb);
+int hci_req_sync(struct hci_dev *hdev, int (*req)(struct hci_request *req,
+ unsigned long opt),
+ unsigned long opt, u32 timeout, u8 *hci_status);
+int __hci_req_sync(struct hci_dev *hdev, int (*func)(struct hci_request *req,
+ unsigned long opt),
+ unsigned long opt, u32 timeout, u8 *hci_status);
+void hci_req_sync_cancel(struct hci_dev *hdev, int err);
+
struct sk_buff *hci_prepare_cmd(struct hci_dev *hdev, u16 opcode, u32 plen,
const void *param);
+int __hci_req_hci_power_on(struct hci_dev *hdev);
+
+void __hci_req_write_fast_connectable(struct hci_request *req, bool enable);
+void __hci_req_update_name(struct hci_request *req);
+void __hci_req_update_eir(struct hci_request *req);
+
void hci_req_add_le_scan_disable(struct hci_request *req);
void hci_req_add_le_passive_scan(struct hci_request *req);
-void hci_update_page_scan(struct hci_dev *hdev);
-void __hci_update_page_scan(struct hci_request *req);
+void hci_req_reenable_advertising(struct hci_dev *hdev);
+void __hci_req_enable_advertising(struct hci_request *req);
+void __hci_req_disable_advertising(struct hci_request *req);
+void __hci_req_update_adv_data(struct hci_request *req, u8 instance);
+int hci_req_update_adv_data(struct hci_dev *hdev, u8 instance);
+void __hci_req_update_scan_rsp_data(struct hci_request *req, u8 instance);
-int hci_update_random_address(struct hci_request *req, bool require_privacy,
- u8 *own_addr_type);
+int __hci_req_schedule_adv_instance(struct hci_request *req, u8 instance,
+ bool force);
+void hci_req_clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
+ u8 instance, bool force);
+
+void __hci_req_update_class(struct hci_request *req);
-void hci_update_background_scan(struct hci_dev *hdev);
-void __hci_update_background_scan(struct hci_request *req);
+/* Returns true if HCI commands were queued */
+bool hci_req_stop_discovery(struct hci_request *req);
+
+static inline void hci_req_update_scan(struct hci_dev *hdev)
+{
+ queue_work(hdev->req_workqueue, &hdev->scan_update);
+}
+
+void __hci_req_update_scan(struct hci_request *req);
+
+int hci_update_random_address(struct hci_request *req, bool require_privacy,
+ bool use_rpa, u8 *own_addr_type);
int hci_abort_conn(struct hci_conn *conn, u8 reason);
void __hci_abort_conn(struct hci_request *req, struct hci_conn *conn,
u8 reason);
+
+static inline void hci_update_background_scan(struct hci_dev *hdev)
+{
+ queue_work(hdev->req_workqueue, &hdev->bg_scan_update);
+}
+
+void hci_request_setup(struct hci_dev *hdev);
+void hci_request_cancel_all(struct hci_dev *hdev);
diff --git a/net/bluetooth/hci_sock.c b/net/bluetooth/hci_sock.c
index b1eb8c09a660..1298d723c0e0 100644
--- a/net/bluetooth/hci_sock.c
+++ b/net/bluetooth/hci_sock.c
@@ -25,6 +25,7 @@
/* Bluetooth HCI sockets. */
#include <linux/export.h>
+#include <linux/utsname.h>
#include <asm/unaligned.h>
#include <net/bluetooth/bluetooth.h>
@@ -120,13 +121,13 @@ static bool is_filtered_packet(struct sock *sk, struct sk_buff *skb)
/* Apply filter */
flt = &hci_pi(sk)->filter;
- flt_type = bt_cb(skb)->pkt_type & HCI_FLT_TYPE_BITS;
+ flt_type = hci_skb_pkt_type(skb) & HCI_FLT_TYPE_BITS;
if (!test_bit(flt_type, &flt->type_mask))
return true;
/* Extra filter for event packets only */
- if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT)
+ if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT)
return false;
flt_event = (*(__u8 *)skb->data & HCI_FLT_EVENT_BITS);
@@ -170,19 +171,19 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
continue;
if (hci_pi(sk)->channel == HCI_CHANNEL_RAW) {
- if (bt_cb(skb)->pkt_type != HCI_COMMAND_PKT &&
- bt_cb(skb)->pkt_type != HCI_EVENT_PKT &&
- bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
- bt_cb(skb)->pkt_type != HCI_SCODATA_PKT)
+ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
+ hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT)
continue;
if (is_filtered_packet(sk, skb))
continue;
} else if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
if (!bt_cb(skb)->incoming)
continue;
- if (bt_cb(skb)->pkt_type != HCI_EVENT_PKT &&
- bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
- bt_cb(skb)->pkt_type != HCI_SCODATA_PKT)
+ if (hci_skb_pkt_type(skb) != HCI_EVENT_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT)
continue;
} else {
/* Don't send frame to other channel types */
@@ -196,7 +197,7 @@ void hci_send_to_sock(struct hci_dev *hdev, struct sk_buff *skb)
continue;
/* Put type byte before the data */
- memcpy(skb_push(skb_copy, 1), &bt_cb(skb)->pkt_type, 1);
+ memcpy(skb_push(skb_copy, 1), &hci_skb_pkt_type(skb), 1);
}
nskb = skb_clone(skb_copy, GFP_ATOMIC);
@@ -262,7 +263,7 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
BT_DBG("hdev %p len %d", hdev, skb->len);
- switch (bt_cb(skb)->pkt_type) {
+ switch (hci_skb_pkt_type(skb)) {
case HCI_COMMAND_PKT:
opcode = cpu_to_le16(HCI_MON_COMMAND_PKT);
break;
@@ -294,7 +295,7 @@ void hci_send_to_monitor(struct hci_dev *hdev, struct sk_buff *skb)
return;
/* Put header before the data */
- hdr = (void *) skb_push(skb_copy, HCI_MON_HDR_SIZE);
+ hdr = (void *)skb_push(skb_copy, HCI_MON_HDR_SIZE);
hdr->opcode = opcode;
hdr->index = cpu_to_le16(hdev->id);
hdr->len = cpu_to_le16(skb->len);
@@ -375,7 +376,7 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
__net_timestamp(skb);
- hdr = (void *) skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
hdr->opcode = opcode;
hdr->index = cpu_to_le16(hdev->id);
hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
@@ -383,6 +384,38 @@ static struct sk_buff *create_monitor_event(struct hci_dev *hdev, int event)
return skb;
}
+static void __printf(2, 3)
+send_monitor_note(struct sock *sk, const char *fmt, ...)
+{
+ size_t len;
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+ va_list args;
+
+ va_start(args, fmt);
+ len = vsnprintf(NULL, 0, fmt, args);
+ va_end(args);
+
+ skb = bt_skb_alloc(len + 1, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ va_start(args, fmt);
+ vsprintf(skb_put(skb, len), fmt, args);
+ *skb_put(skb, 1) = 0;
+ va_end(args);
+
+ __net_timestamp(skb);
+
+ hdr = (void *)skb_push(skb, HCI_MON_HDR_SIZE);
+ hdr->opcode = cpu_to_le16(HCI_MON_SYSTEM_NOTE);
+ hdr->index = cpu_to_le16(HCI_DEV_NONE);
+ hdr->len = cpu_to_le16(skb->len - HCI_MON_HDR_SIZE);
+
+ if (sock_queue_rcv_skb(sk, skb))
+ kfree_skb(skb);
+}
+
static void send_monitor_replay(struct sock *sk)
{
struct hci_dev *hdev;
@@ -436,18 +469,18 @@ static void hci_si_event(struct hci_dev *hdev, int type, int dlen, void *data)
if (!skb)
return;
- hdr = (void *) skb_put(skb, HCI_EVENT_HDR_SIZE);
+ hdr = (void *)skb_put(skb, HCI_EVENT_HDR_SIZE);
hdr->evt = HCI_EV_STACK_INTERNAL;
hdr->plen = sizeof(*ev) + dlen;
- ev = (void *) skb_put(skb, sizeof(*ev) + dlen);
+ ev = (void *)skb_put(skb, sizeof(*ev) + dlen);
ev->type = type;
memcpy(ev->data, data, dlen);
bt_cb(skb)->incoming = 1;
__net_timestamp(skb);
- bt_cb(skb)->pkt_type = HCI_EVENT_PKT;
+ hci_skb_pkt_type(skb) = HCI_EVENT_PKT;
hci_send_to_sock(hdev, skb);
kfree_skb(skb);
}
@@ -653,20 +686,20 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
return -EOPNOTSUPP;
case HCIGETCONNINFO:
- return hci_get_conn_info(hdev, (void __user *) arg);
+ return hci_get_conn_info(hdev, (void __user *)arg);
case HCIGETAUTHINFO:
- return hci_get_auth_info(hdev, (void __user *) arg);
+ return hci_get_auth_info(hdev, (void __user *)arg);
case HCIBLOCKADDR:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return hci_sock_blacklist_add(hdev, (void __user *) arg);
+ return hci_sock_blacklist_add(hdev, (void __user *)arg);
case HCIUNBLOCKADDR:
if (!capable(CAP_NET_ADMIN))
return -EPERM;
- return hci_sock_blacklist_del(hdev, (void __user *) arg);
+ return hci_sock_blacklist_del(hdev, (void __user *)arg);
}
return -ENOIOCTLCMD;
@@ -675,7 +708,7 @@ static int hci_sock_bound_ioctl(struct sock *sk, unsigned int cmd,
static int hci_sock_ioctl(struct socket *sock, unsigned int cmd,
unsigned long arg)
{
- void __user *argp = (void __user *) arg;
+ void __user *argp = (void __user *)arg;
struct sock *sk = sock->sk;
int err;
@@ -872,11 +905,28 @@ static int hci_sock_bind(struct socket *sock, struct sockaddr *addr,
*/
hci_sock_set_flag(sk, HCI_SOCK_TRUSTED);
+ send_monitor_note(sk, "Linux version %s (%s)",
+ init_utsname()->release,
+ init_utsname()->machine);
+ send_monitor_note(sk, "Bluetooth subsystem version %s",
+ BT_SUBSYS_VERSION);
send_monitor_replay(sk);
atomic_inc(&monitor_promisc);
break;
+ case HCI_CHANNEL_LOGGING:
+ if (haddr.hci_dev != HCI_DEV_NONE) {
+ err = -EINVAL;
+ goto done;
+ }
+
+ if (!capable(CAP_NET_ADMIN)) {
+ err = -EPERM;
+ goto done;
+ }
+ break;
+
default:
if (!hci_mgmt_chan_find(haddr.hci_channel)) {
err = -EINVAL;
@@ -926,7 +976,7 @@ done:
static int hci_sock_getname(struct socket *sock, struct sockaddr *addr,
int *addr_len, int peer)
{
- struct sockaddr_hci *haddr = (struct sockaddr_hci *) addr;
+ struct sockaddr_hci *haddr = (struct sockaddr_hci *)addr;
struct sock *sk = sock->sk;
struct hci_dev *hdev;
int err = 0;
@@ -991,8 +1041,8 @@ static void hci_sock_cmsg(struct sock *sk, struct msghdr *msg,
}
}
-static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
- int flags)
+static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
{
int noblock = flags & MSG_DONTWAIT;
struct sock *sk = sock->sk;
@@ -1004,6 +1054,9 @@ static int hci_sock_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (flags & MSG_OOB)
return -EOPNOTSUPP;
+ if (hci_pi(sk)->channel == HCI_CHANNEL_LOGGING)
+ return -EOPNOTSUPP;
+
if (sk->sk_state == BT_CLOSED)
return 0;
@@ -1150,6 +1203,90 @@ done:
return err;
}
+static int hci_logging_frame(struct sock *sk, struct msghdr *msg, int len)
+{
+ struct hci_mon_hdr *hdr;
+ struct sk_buff *skb;
+ struct hci_dev *hdev;
+ u16 index;
+ int err;
+
+ /* The logging frame consists at minimum of the standard header,
+ * the priority byte, the ident length byte and at least one string
+ * terminator NUL byte. Anything shorter are invalid packets.
+ */
+ if (len < sizeof(*hdr) + 3)
+ return -EINVAL;
+
+ skb = bt_skb_send_alloc(sk, len, msg->msg_flags & MSG_DONTWAIT, &err);
+ if (!skb)
+ return err;
+
+ if (memcpy_from_msg(skb_put(skb, len), msg, len)) {
+ err = -EFAULT;
+ goto drop;
+ }
+
+ hdr = (void *)skb->data;
+
+ if (__le16_to_cpu(hdr->len) != len - sizeof(*hdr)) {
+ err = -EINVAL;
+ goto drop;
+ }
+
+ if (__le16_to_cpu(hdr->opcode) == 0x0000) {
+ __u8 priority = skb->data[sizeof(*hdr)];
+ __u8 ident_len = skb->data[sizeof(*hdr) + 1];
+
+ /* Only the priorities 0-7 are valid and with that any other
+ * value results in an invalid packet.
+ *
+ * The priority byte is followed by an ident length byte and
+ * the NUL terminated ident string. Check that the ident
+ * length is not overflowing the packet and also that the
+ * ident string itself is NUL terminated. In case the ident
+ * length is zero, the length value actually doubles as NUL
+ * terminator identifier.
+ *
+ * The message follows the ident string (if present) and
+ * must be NUL terminated. Otherwise it is not a valid packet.
+ */
+ if (priority > 7 || skb->data[len - 1] != 0x00 ||
+ ident_len > len - sizeof(*hdr) - 3 ||
+ skb->data[sizeof(*hdr) + ident_len + 1] != 0x00) {
+ err = -EINVAL;
+ goto drop;
+ }
+ } else {
+ err = -EINVAL;
+ goto drop;
+ }
+
+ index = __le16_to_cpu(hdr->index);
+
+ if (index != MGMT_INDEX_NONE) {
+ hdev = hci_dev_get(index);
+ if (!hdev) {
+ err = -ENODEV;
+ goto drop;
+ }
+ } else {
+ hdev = NULL;
+ }
+
+ hdr->opcode = cpu_to_le16(HCI_MON_USER_LOGGING);
+
+ hci_send_to_channel(HCI_CHANNEL_MONITOR, skb, HCI_SOCK_TRUSTED, NULL);
+ err = len;
+
+ if (hdev)
+ hci_dev_put(hdev);
+
+drop:
+ kfree_skb(skb);
+ return err;
+}
+
static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
size_t len)
{
@@ -1179,6 +1316,9 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
case HCI_CHANNEL_MONITOR:
err = -EOPNOTSUPP;
goto done;
+ case HCI_CHANNEL_LOGGING:
+ err = hci_logging_frame(sk, msg, len);
+ goto done;
default:
mutex_lock(&mgmt_chan_list_lock);
chan = __hci_mgmt_chan_find(hci_pi(sk)->channel);
@@ -1211,7 +1351,7 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
goto drop;
}
- bt_cb(skb)->pkt_type = *((unsigned char *) skb->data);
+ hci_skb_pkt_type(skb) = skb->data[0];
skb_pull(skb, 1);
if (hci_pi(sk)->channel == HCI_CHANNEL_USER) {
@@ -1220,16 +1360,16 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
*
* However check that the packet type is valid.
*/
- if (bt_cb(skb)->pkt_type != HCI_COMMAND_PKT &&
- bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
- bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) {
+ if (hci_skb_pkt_type(skb) != HCI_COMMAND_PKT &&
+ hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
err = -EINVAL;
goto drop;
}
skb_queue_tail(&hdev->raw_q, skb);
queue_work(hdev->workqueue, &hdev->tx_work);
- } else if (bt_cb(skb)->pkt_type == HCI_COMMAND_PKT) {
+ } else if (hci_skb_pkt_type(skb) == HCI_COMMAND_PKT) {
u16 opcode = get_unaligned_le16(skb->data);
u16 ogf = hci_opcode_ogf(opcode);
u16 ocf = hci_opcode_ocf(opcode);
@@ -1242,6 +1382,11 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
goto drop;
}
+ /* Since the opcode has already been extracted here, store
+ * a copy of the value for later use by the drivers.
+ */
+ hci_skb_opcode(skb) = opcode;
+
if (ogf == 0x3f) {
skb_queue_tail(&hdev->raw_q, skb);
queue_work(hdev->workqueue, &hdev->tx_work);
@@ -1249,7 +1394,7 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
/* Stand-alone HCI commands must be flagged as
* single-command requests.
*/
- bt_cb(skb)->hci.req_start = true;
+ bt_cb(skb)->hci.req_flags |= HCI_REQ_START;
skb_queue_tail(&hdev->cmd_q, skb);
queue_work(hdev->workqueue, &hdev->cmd_work);
@@ -1260,8 +1405,8 @@ static int hci_sock_sendmsg(struct socket *sock, struct msghdr *msg,
goto drop;
}
- if (bt_cb(skb)->pkt_type != HCI_ACLDATA_PKT &&
- bt_cb(skb)->pkt_type != HCI_SCODATA_PKT) {
+ if (hci_skb_pkt_type(skb) != HCI_ACLDATA_PKT &&
+ hci_skb_pkt_type(skb) != HCI_SCODATA_PKT) {
err = -EINVAL;
goto drop;
}
diff --git a/net/bluetooth/l2cap_core.c b/net/bluetooth/l2cap_core.c
index 7c65ee200c29..eb4f5f24cbe3 100644
--- a/net/bluetooth/l2cap_core.c
+++ b/net/bluetooth/l2cap_core.c
@@ -197,10 +197,20 @@ int l2cap_add_psm(struct l2cap_chan *chan, bdaddr_t *src, __le16 psm)
chan->sport = psm;
err = 0;
} else {
- u16 p;
+ u16 p, start, end, incr;
+
+ if (chan->src_type == BDADDR_BREDR) {
+ start = L2CAP_PSM_DYN_START;
+ end = L2CAP_PSM_AUTO_END;
+ incr = 2;
+ } else {
+ start = L2CAP_PSM_LE_DYN_START;
+ end = L2CAP_PSM_LE_DYN_END;
+ incr = 1;
+ }
err = -EINVAL;
- for (p = 0x1001; p < 0x1100; p += 2)
+ for (p = start; p <= end; p += incr)
if (!__l2cap_global_chan_by_addr(cpu_to_le16(p), src)) {
chan->psm = cpu_to_le16(p);
chan->sport = cpu_to_le16(p);
@@ -239,7 +249,7 @@ static u16 l2cap_alloc_cid(struct l2cap_conn *conn)
else
dyn_end = L2CAP_CID_DYN_END;
- for (cid = L2CAP_CID_DYN_START; cid < dyn_end; cid++) {
+ for (cid = L2CAP_CID_DYN_START; cid <= dyn_end; cid++) {
if (!__l2cap_get_chan_by_scid(conn, cid))
return cid;
}
@@ -5250,7 +5260,9 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
credits = __le16_to_cpu(rsp->credits);
result = __le16_to_cpu(rsp->result);
- if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23))
+ if (result == L2CAP_CR_SUCCESS && (mtu < 23 || mps < 23 ||
+ dcid < L2CAP_CID_DYN_START ||
+ dcid > L2CAP_CID_LE_DYN_END))
return -EPROTO;
BT_DBG("dcid 0x%4.4x mtu %u mps %u credits %u result 0x%2.2x",
@@ -5270,6 +5282,11 @@ static int l2cap_le_connect_rsp(struct l2cap_conn *conn,
switch (result) {
case L2CAP_CR_SUCCESS:
+ if (__l2cap_get_chan_by_dcid(conn, dcid)) {
+ err = -EBADSLT;
+ break;
+ }
+
chan->ident = 0;
chan->dcid = dcid;
chan->omtu = mtu;
@@ -5437,9 +5454,16 @@ static int l2cap_le_connect_req(struct l2cap_conn *conn,
goto response_unlock;
}
+ /* Check for valid dynamic CID range */
+ if (scid < L2CAP_CID_DYN_START || scid > L2CAP_CID_LE_DYN_END) {
+ result = L2CAP_CR_INVALID_SCID;
+ chan = NULL;
+ goto response_unlock;
+ }
+
/* Check if we already have channel with that dcid */
if (__l2cap_get_chan_by_dcid(conn, scid)) {
- result = L2CAP_CR_NO_MEM;
+ result = L2CAP_CR_SCID_IN_USE;
chan = NULL;
goto response_unlock;
}
@@ -6524,8 +6548,6 @@ static int l2cap_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
struct sk_buff *skb)
{
- int err = 0;
-
BT_DBG("chan %p, control %p, skb %p, state %d", chan, control, skb,
chan->rx_state);
@@ -6556,7 +6578,7 @@ static int l2cap_stream_rx(struct l2cap_chan *chan, struct l2cap_ctrl *control,
chan->last_acked_seq = control->txseq;
chan->expected_tx_seq = __next_seq(chan, control->txseq);
- return err;
+ return 0;
}
static int l2cap_data_rcv(struct l2cap_chan *chan, struct sk_buff *skb)
@@ -7099,8 +7121,6 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
chan->dcid = cid;
if (bdaddr_type_is_le(dst_type)) {
- u8 role;
-
/* Convert from L2CAP channel address type to HCI address type
*/
if (dst_type == BDADDR_LE_PUBLIC)
@@ -7109,14 +7129,15 @@ int l2cap_chan_connect(struct l2cap_chan *chan, __le16 psm, u16 cid,
dst_type = ADDR_LE_DEV_RANDOM;
if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
- role = HCI_ROLE_SLAVE;
+ hcon = hci_connect_le(hdev, dst, dst_type,
+ chan->sec_level,
+ HCI_LE_CONN_TIMEOUT,
+ HCI_ROLE_SLAVE);
else
- role = HCI_ROLE_MASTER;
+ hcon = hci_connect_le_scan(hdev, dst, dst_type,
+ chan->sec_level,
+ HCI_LE_CONN_TIMEOUT);
- hcon = hci_connect_le_scan(hdev, dst, dst_type,
- chan->sec_level,
- HCI_LE_CONN_TIMEOUT,
- role);
} else {
u8 auth_type = l2cap_get_auth_type(chan);
hcon = hci_connect_acl(hdev, dst, chan->sec_level, auth_type);
diff --git a/net/bluetooth/l2cap_sock.c b/net/bluetooth/l2cap_sock.c
index 1bb551527044..e4cae72895a7 100644
--- a/net/bluetooth/l2cap_sock.c
+++ b/net/bluetooth/l2cap_sock.c
@@ -58,7 +58,7 @@ static int l2cap_validate_bredr_psm(u16 psm)
return -EINVAL;
/* Restrict usage of well-known PSMs */
- if (psm < 0x1001 && !capable(CAP_NET_BIND_SERVICE))
+ if (psm < L2CAP_PSM_DYN_START && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
return 0;
@@ -67,11 +67,11 @@ static int l2cap_validate_bredr_psm(u16 psm)
static int l2cap_validate_le_psm(u16 psm)
{
/* Valid LE_PSM ranges are defined only until 0x00ff */
- if (psm > 0x00ff)
+ if (psm > L2CAP_PSM_LE_DYN_END)
return -EINVAL;
/* Restrict fixed, SIG assigned PSM values to CAP_NET_BIND_SERVICE */
- if (psm <= 0x007f && !capable(CAP_NET_BIND_SERVICE))
+ if (psm < L2CAP_PSM_LE_DYN_START && !capable(CAP_NET_BIND_SERVICE))
return -EACCES;
return 0;
@@ -125,6 +125,9 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
goto done;
}
+ bacpy(&chan->src, &la.l2_bdaddr);
+ chan->src_type = la.l2_bdaddr_type;
+
if (la.l2_cid)
err = l2cap_add_scid(chan, __le16_to_cpu(la.l2_cid));
else
@@ -156,9 +159,6 @@ static int l2cap_sock_bind(struct socket *sock, struct sockaddr *addr, int alen)
break;
}
- bacpy(&chan->src, &la.l2_bdaddr);
- chan->src_type = la.l2_bdaddr_type;
-
if (chan->psm && bdaddr_type_is_le(chan->src_type))
chan->mode = L2CAP_MODE_LE_FLOWCTL;
diff --git a/net/bluetooth/leds.c b/net/bluetooth/leds.c
new file mode 100644
index 000000000000..8319c8440c89
--- /dev/null
+++ b/net/bluetooth/leds.c
@@ -0,0 +1,74 @@
+/*
+ * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <net/bluetooth/bluetooth.h>
+#include <net/bluetooth/hci_core.h>
+
+#include "leds.h"
+
+struct hci_basic_led_trigger {
+ struct led_trigger led_trigger;
+ struct hci_dev *hdev;
+};
+
+#define to_hci_basic_led_trigger(arg) container_of(arg, \
+ struct hci_basic_led_trigger, led_trigger)
+
+void hci_leds_update_powered(struct hci_dev *hdev, bool enabled)
+{
+ if (hdev->power_led)
+ led_trigger_event(hdev->power_led,
+ enabled ? LED_FULL : LED_OFF);
+}
+
+static void power_activate(struct led_classdev *led_cdev)
+{
+ struct hci_basic_led_trigger *htrig;
+ bool powered;
+
+ htrig = to_hci_basic_led_trigger(led_cdev->trigger);
+ powered = test_bit(HCI_UP, &htrig->hdev->flags);
+
+ led_trigger_event(led_cdev->trigger, powered ? LED_FULL : LED_OFF);
+}
+
+static struct led_trigger *led_allocate_basic(struct hci_dev *hdev,
+ void (*activate)(struct led_classdev *led_cdev),
+ const char *name)
+{
+ struct hci_basic_led_trigger *htrig;
+
+ htrig = devm_kzalloc(&hdev->dev, sizeof(*htrig), GFP_KERNEL);
+ if (!htrig)
+ return NULL;
+
+ htrig->hdev = hdev;
+ htrig->led_trigger.activate = activate;
+ htrig->led_trigger.name = devm_kasprintf(&hdev->dev, GFP_KERNEL,
+ "%s-%s", hdev->name,
+ name);
+ if (!htrig->led_trigger.name)
+ goto err_alloc;
+
+ if (devm_led_trigger_register(&hdev->dev, &htrig->led_trigger))
+ goto err_register;
+
+ return &htrig->led_trigger;
+
+err_register:
+ devm_kfree(&hdev->dev, (void *)htrig->led_trigger.name);
+err_alloc:
+ devm_kfree(&hdev->dev, htrig);
+ return NULL;
+}
+
+void hci_leds_init(struct hci_dev *hdev)
+{
+ /* initialize power_led */
+ hdev->power_led = led_allocate_basic(hdev, power_activate, "power");
+}
diff --git a/net/bluetooth/leds.h b/net/bluetooth/leds.h
new file mode 100644
index 000000000000..a9c4d6ea01cf
--- /dev/null
+++ b/net/bluetooth/leds.h
@@ -0,0 +1,16 @@
+/*
+ * Copyright 2015, Heiner Kallweit <hkallweit1@gmail.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#if IS_ENABLED(CONFIG_BT_LEDS)
+void hci_leds_update_powered(struct hci_dev *hdev, bool enabled);
+void hci_leds_init(struct hci_dev *hdev);
+#else
+static inline void hci_leds_update_powered(struct hci_dev *hdev,
+ bool enabled) {}
+static inline void hci_leds_init(struct hci_dev *hdev) {}
+#endif
diff --git a/net/bluetooth/mgmt.c b/net/bluetooth/mgmt.c
index 7f22119276f3..9e4b931588cf 100644
--- a/net/bluetooth/mgmt.c
+++ b/net/bluetooth/mgmt.c
@@ -38,7 +38,7 @@
#include "mgmt_util.h"
#define MGMT_VERSION 1
-#define MGMT_REVISION 10
+#define MGMT_REVISION 12
static const u16 mgmt_commands[] = {
MGMT_OP_READ_INDEX_LIST,
@@ -102,6 +102,8 @@ static const u16 mgmt_commands[] = {
MGMT_OP_READ_ADV_FEATURES,
MGMT_OP_ADD_ADVERTISING,
MGMT_OP_REMOVE_ADVERTISING,
+ MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_OP_START_LIMITED_DISCOVERY,
};
static const u16 mgmt_events[] = {
@@ -718,116 +720,6 @@ static u32 get_current_settings(struct hci_dev *hdev)
return settings;
}
-#define PNP_INFO_SVCLASS_ID 0x1200
-
-static u8 *create_uuid16_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
-{
- u8 *ptr = data, *uuids_start = NULL;
- struct bt_uuid *uuid;
-
- if (len < 4)
- return ptr;
-
- list_for_each_entry(uuid, &hdev->uuids, list) {
- u16 uuid16;
-
- if (uuid->size != 16)
- continue;
-
- uuid16 = get_unaligned_le16(&uuid->uuid[12]);
- if (uuid16 < 0x1100)
- continue;
-
- if (uuid16 == PNP_INFO_SVCLASS_ID)
- continue;
-
- if (!uuids_start) {
- uuids_start = ptr;
- uuids_start[0] = 1;
- uuids_start[1] = EIR_UUID16_ALL;
- ptr += 2;
- }
-
- /* Stop if not enough space to put next UUID */
- if ((ptr - data) + sizeof(u16) > len) {
- uuids_start[1] = EIR_UUID16_SOME;
- break;
- }
-
- *ptr++ = (uuid16 & 0x00ff);
- *ptr++ = (uuid16 & 0xff00) >> 8;
- uuids_start[0] += sizeof(uuid16);
- }
-
- return ptr;
-}
-
-static u8 *create_uuid32_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
-{
- u8 *ptr = data, *uuids_start = NULL;
- struct bt_uuid *uuid;
-
- if (len < 6)
- return ptr;
-
- list_for_each_entry(uuid, &hdev->uuids, list) {
- if (uuid->size != 32)
- continue;
-
- if (!uuids_start) {
- uuids_start = ptr;
- uuids_start[0] = 1;
- uuids_start[1] = EIR_UUID32_ALL;
- ptr += 2;
- }
-
- /* Stop if not enough space to put next UUID */
- if ((ptr - data) + sizeof(u32) > len) {
- uuids_start[1] = EIR_UUID32_SOME;
- break;
- }
-
- memcpy(ptr, &uuid->uuid[12], sizeof(u32));
- ptr += sizeof(u32);
- uuids_start[0] += sizeof(u32);
- }
-
- return ptr;
-}
-
-static u8 *create_uuid128_list(struct hci_dev *hdev, u8 *data, ptrdiff_t len)
-{
- u8 *ptr = data, *uuids_start = NULL;
- struct bt_uuid *uuid;
-
- if (len < 18)
- return ptr;
-
- list_for_each_entry(uuid, &hdev->uuids, list) {
- if (uuid->size != 128)
- continue;
-
- if (!uuids_start) {
- uuids_start = ptr;
- uuids_start[0] = 1;
- uuids_start[1] = EIR_UUID128_ALL;
- ptr += 2;
- }
-
- /* Stop if not enough space to put next UUID */
- if ((ptr - data) + 16 > len) {
- uuids_start[1] = EIR_UUID128_SOME;
- break;
- }
-
- memcpy(ptr, uuid->uuid, 16);
- ptr += 16;
- uuids_start[0] += 16;
- }
-
- return ptr;
-}
-
static struct mgmt_pending_cmd *pending_find(u16 opcode, struct hci_dev *hdev)
{
return mgmt_pending_find(HCI_CHANNEL_CONTROL, opcode, hdev);
@@ -840,98 +732,7 @@ static struct mgmt_pending_cmd *pending_find_data(u16 opcode,
return mgmt_pending_find_data(HCI_CHANNEL_CONTROL, opcode, hdev, data);
}
-static u8 get_current_adv_instance(struct hci_dev *hdev)
-{
- /* The "Set Advertising" setting supersedes the "Add Advertising"
- * setting. Here we set the advertising data based on which
- * setting was set. When neither apply, default to the global settings,
- * represented by instance "0".
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
- !hci_dev_test_flag(hdev, HCI_ADVERTISING))
- return hdev->cur_adv_instance;
-
- return 0x00;
-}
-
-static u8 create_default_scan_rsp_data(struct hci_dev *hdev, u8 *ptr)
-{
- u8 ad_len = 0;
- size_t name_len;
-
- name_len = strlen(hdev->dev_name);
- if (name_len > 0) {
- size_t max_len = HCI_MAX_AD_LENGTH - ad_len - 2;
-
- if (name_len > max_len) {
- name_len = max_len;
- ptr[1] = EIR_NAME_SHORT;
- } else
- ptr[1] = EIR_NAME_COMPLETE;
-
- ptr[0] = name_len + 1;
-
- memcpy(ptr + 2, hdev->dev_name, name_len);
-
- ad_len += (name_len + 2);
- ptr += (name_len + 2);
- }
-
- return ad_len;
-}
-
-static u8 create_instance_scan_rsp_data(struct hci_dev *hdev, u8 instance,
- u8 *ptr)
-{
- struct adv_info *adv_instance;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return 0;
-
- /* TODO: Set the appropriate entries based on advertising instance flags
- * here once flags other than 0 are supported.
- */
- memcpy(ptr, adv_instance->scan_rsp_data,
- adv_instance->scan_rsp_len);
-
- return adv_instance->scan_rsp_len;
-}
-
-static void update_inst_scan_rsp_data(struct hci_request *req, u8 instance)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_scan_rsp_data cp;
- u8 len;
-
- if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
- return;
-
- memset(&cp, 0, sizeof(cp));
-
- if (instance)
- len = create_instance_scan_rsp_data(hdev, instance, cp.data);
- else
- len = create_default_scan_rsp_data(hdev, cp.data);
-
- if (hdev->scan_rsp_data_len == len &&
- !memcmp(cp.data, hdev->scan_rsp_data, len))
- return;
-
- memcpy(hdev->scan_rsp_data, cp.data, sizeof(cp.data));
- hdev->scan_rsp_data_len = len;
-
- cp.length = len;
-
- hci_req_add(req, HCI_OP_LE_SET_SCAN_RSP_DATA, sizeof(cp), &cp);
-}
-
-static void update_scan_rsp_data(struct hci_request *req)
-{
- update_inst_scan_rsp_data(req, get_current_adv_instance(req->hdev));
-}
-
-static u8 get_adv_discov_flags(struct hci_dev *hdev)
+u8 mgmt_get_adv_discov_flags(struct hci_dev *hdev)
{
struct mgmt_pending_cmd *cmd;
@@ -955,7 +756,7 @@ static u8 get_adv_discov_flags(struct hci_dev *hdev)
return 0;
}
-static bool get_connectable(struct hci_dev *hdev)
+bool mgmt_get_connectable(struct hci_dev *hdev)
{
struct mgmt_pending_cmd *cmd;
@@ -972,344 +773,6 @@ static bool get_connectable(struct hci_dev *hdev)
return hci_dev_test_flag(hdev, HCI_CONNECTABLE);
}
-static u32 get_adv_instance_flags(struct hci_dev *hdev, u8 instance)
-{
- u32 flags;
- struct adv_info *adv_instance;
-
- if (instance == 0x00) {
- /* Instance 0 always manages the "Tx Power" and "Flags"
- * fields
- */
- flags = MGMT_ADV_FLAG_TX_POWER | MGMT_ADV_FLAG_MANAGED_FLAGS;
-
- /* For instance 0, the HCI_ADVERTISING_CONNECTABLE setting
- * corresponds to the "connectable" instance flag.
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING_CONNECTABLE))
- flags |= MGMT_ADV_FLAG_CONNECTABLE;
-
- return flags;
- }
-
- adv_instance = hci_find_adv_instance(hdev, instance);
-
- /* Return 0 when we got an invalid instance identifier. */
- if (!adv_instance)
- return 0;
-
- return adv_instance->flags;
-}
-
-static u8 get_cur_adv_instance_scan_rsp_len(struct hci_dev *hdev)
-{
- u8 instance = get_current_adv_instance(hdev);
- struct adv_info *adv_instance;
-
- /* Ignore instance 0 */
- if (instance == 0x00)
- return 0;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return 0;
-
- /* TODO: Take into account the "appearance" and "local-name" flags here.
- * These are currently being ignored as they are not supported.
- */
- return adv_instance->scan_rsp_len;
-}
-
-static u8 create_instance_adv_data(struct hci_dev *hdev, u8 instance, u8 *ptr)
-{
- struct adv_info *adv_instance = NULL;
- u8 ad_len = 0, flags = 0;
- u32 instance_flags;
-
- /* Return 0 when the current instance identifier is invalid. */
- if (instance) {
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return 0;
- }
-
- instance_flags = get_adv_instance_flags(hdev, instance);
-
- /* The Add Advertising command allows userspace to set both the general
- * and limited discoverable flags.
- */
- if (instance_flags & MGMT_ADV_FLAG_DISCOV)
- flags |= LE_AD_GENERAL;
-
- if (instance_flags & MGMT_ADV_FLAG_LIMITED_DISCOV)
- flags |= LE_AD_LIMITED;
-
- if (flags || (instance_flags & MGMT_ADV_FLAG_MANAGED_FLAGS)) {
- /* If a discovery flag wasn't provided, simply use the global
- * settings.
- */
- if (!flags)
- flags |= get_adv_discov_flags(hdev);
-
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- flags |= LE_AD_NO_BREDR;
-
- /* If flags would still be empty, then there is no need to
- * include the "Flags" AD field".
- */
- if (flags) {
- ptr[0] = 0x02;
- ptr[1] = EIR_FLAGS;
- ptr[2] = flags;
-
- ad_len += 3;
- ptr += 3;
- }
- }
-
- if (adv_instance) {
- memcpy(ptr, adv_instance->adv_data,
- adv_instance->adv_data_len);
- ad_len += adv_instance->adv_data_len;
- ptr += adv_instance->adv_data_len;
- }
-
- /* Provide Tx Power only if we can provide a valid value for it */
- if (hdev->adv_tx_power != HCI_TX_POWER_INVALID &&
- (instance_flags & MGMT_ADV_FLAG_TX_POWER)) {
- ptr[0] = 0x02;
- ptr[1] = EIR_TX_POWER;
- ptr[2] = (u8)hdev->adv_tx_power;
-
- ad_len += 3;
- ptr += 3;
- }
-
- return ad_len;
-}
-
-static void update_inst_adv_data(struct hci_request *req, u8 instance)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_adv_data cp;
- u8 len;
-
- if (!hci_dev_test_flag(hdev, HCI_LE_ENABLED))
- return;
-
- memset(&cp, 0, sizeof(cp));
-
- len = create_instance_adv_data(hdev, instance, cp.data);
-
- /* There's nothing to do if the data hasn't changed */
- if (hdev->adv_data_len == len &&
- memcmp(cp.data, hdev->adv_data, len) == 0)
- return;
-
- memcpy(hdev->adv_data, cp.data, sizeof(cp.data));
- hdev->adv_data_len = len;
-
- cp.length = len;
-
- hci_req_add(req, HCI_OP_LE_SET_ADV_DATA, sizeof(cp), &cp);
-}
-
-static void update_adv_data(struct hci_request *req)
-{
- update_inst_adv_data(req, get_current_adv_instance(req->hdev));
-}
-
-int mgmt_update_adv_data(struct hci_dev *hdev)
-{
- struct hci_request req;
-
- hci_req_init(&req, hdev);
- update_adv_data(&req);
-
- return hci_req_run(&req, NULL);
-}
-
-static void create_eir(struct hci_dev *hdev, u8 *data)
-{
- u8 *ptr = data;
- size_t name_len;
-
- name_len = strlen(hdev->dev_name);
-
- if (name_len > 0) {
- /* EIR Data type */
- if (name_len > 48) {
- name_len = 48;
- ptr[1] = EIR_NAME_SHORT;
- } else
- ptr[1] = EIR_NAME_COMPLETE;
-
- /* EIR Data length */
- ptr[0] = name_len + 1;
-
- memcpy(ptr + 2, hdev->dev_name, name_len);
-
- ptr += (name_len + 2);
- }
-
- if (hdev->inq_tx_power != HCI_TX_POWER_INVALID) {
- ptr[0] = 2;
- ptr[1] = EIR_TX_POWER;
- ptr[2] = (u8) hdev->inq_tx_power;
-
- ptr += 3;
- }
-
- if (hdev->devid_source > 0) {
- ptr[0] = 9;
- ptr[1] = EIR_DEVICE_ID;
-
- put_unaligned_le16(hdev->devid_source, ptr + 2);
- put_unaligned_le16(hdev->devid_vendor, ptr + 4);
- put_unaligned_le16(hdev->devid_product, ptr + 6);
- put_unaligned_le16(hdev->devid_version, ptr + 8);
-
- ptr += 10;
- }
-
- ptr = create_uuid16_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
- ptr = create_uuid32_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
- ptr = create_uuid128_list(hdev, ptr, HCI_MAX_EIR_LENGTH - (ptr - data));
-}
-
-static void update_eir(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_eir cp;
-
- if (!hdev_is_powered(hdev))
- return;
-
- if (!lmp_ext_inq_capable(hdev))
- return;
-
- if (!hci_dev_test_flag(hdev, HCI_SSP_ENABLED))
- return;
-
- if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
- return;
-
- memset(&cp, 0, sizeof(cp));
-
- create_eir(hdev, cp.data);
-
- if (memcmp(cp.data, hdev->eir, sizeof(cp.data)) == 0)
- return;
-
- memcpy(hdev->eir, cp.data, sizeof(cp.data));
-
- hci_req_add(req, HCI_OP_WRITE_EIR, sizeof(cp), &cp);
-}
-
-static u8 get_service_classes(struct hci_dev *hdev)
-{
- struct bt_uuid *uuid;
- u8 val = 0;
-
- list_for_each_entry(uuid, &hdev->uuids, list)
- val |= uuid->svc_hint;
-
- return val;
-}
-
-static void update_class(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- u8 cod[3];
-
- BT_DBG("%s", hdev->name);
-
- if (!hdev_is_powered(hdev))
- return;
-
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- return;
-
- if (hci_dev_test_flag(hdev, HCI_SERVICE_CACHE))
- return;
-
- cod[0] = hdev->minor_class;
- cod[1] = hdev->major_class;
- cod[2] = get_service_classes(hdev);
-
- if (hci_dev_test_flag(hdev, HCI_LIMITED_DISCOVERABLE))
- cod[1] |= 0x20;
-
- if (memcmp(cod, hdev->dev_class, 3) == 0)
- return;
-
- hci_req_add(req, HCI_OP_WRITE_CLASS_OF_DEV, sizeof(cod), cod);
-}
-
-static void disable_advertising(struct hci_request *req)
-{
- u8 enable = 0x00;
-
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
-}
-
-static void enable_advertising(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_adv_param cp;
- u8 own_addr_type, enable = 0x01;
- bool connectable;
- u8 instance;
- u32 flags;
-
- if (hci_conn_num(hdev, LE_LINK) > 0)
- return;
-
- if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- disable_advertising(req);
-
- /* Clear the HCI_LE_ADV bit temporarily so that the
- * hci_update_random_address knows that it's safe to go ahead
- * and write a new random address. The flag will be set back on
- * as soon as the SET_ADV_ENABLE HCI command completes.
- */
- hci_dev_clear_flag(hdev, HCI_LE_ADV);
-
- instance = get_current_adv_instance(hdev);
- flags = get_adv_instance_flags(hdev, instance);
-
- /* If the "connectable" instance flag was not set, then choose between
- * ADV_IND and ADV_NONCONN_IND based on the global connectable setting.
- */
- connectable = (flags & MGMT_ADV_FLAG_CONNECTABLE) ||
- get_connectable(hdev);
-
- /* Set require_privacy to true only when non-connectable
- * advertising is used. In that case it is fine to use a
- * non-resolvable private address.
- */
- if (hci_update_random_address(req, !connectable, &own_addr_type) < 0)
- return;
-
- memset(&cp, 0, sizeof(cp));
- cp.min_interval = cpu_to_le16(hdev->le_adv_min_interval);
- cp.max_interval = cpu_to_le16(hdev->le_adv_max_interval);
-
- if (connectable)
- cp.type = LE_ADV_IND;
- else if (get_cur_adv_instance_scan_rsp_len(hdev))
- cp.type = LE_ADV_SCAN_IND;
- else
- cp.type = LE_ADV_NONCONN_IND;
-
- cp.own_address_type = own_addr_type;
- cp.channel_map = hdev->le_adv_channel_map;
-
- hci_req_add(req, HCI_OP_LE_SET_ADV_PARAM, sizeof(cp), &cp);
-
- hci_req_add(req, HCI_OP_LE_SET_ADV_ENABLE, sizeof(enable), &enable);
-}
-
static void service_cache_off(struct work_struct *work)
{
struct hci_dev *hdev = container_of(work, struct hci_dev,
@@ -1323,8 +786,8 @@ static void service_cache_off(struct work_struct *work)
hci_dev_lock(hdev);
- update_eir(&req);
- update_class(&req);
+ __hci_req_update_eir(&req);
+ __hci_req_update_class(&req);
hci_dev_unlock(hdev);
@@ -1345,10 +808,11 @@ static void rpa_expired(struct work_struct *work)
return;
/* The generation of a new RPA and programming it into the
- * controller happens in the enable_advertising() function.
+ * controller happens in the hci_req_enable_advertising()
+ * function.
*/
hci_req_init(&req, hdev);
- enable_advertising(&req);
+ __hci_req_enable_advertising(&req);
hci_req_run(&req, NULL);
}
@@ -1416,51 +880,7 @@ static void clean_up_hci_complete(struct hci_dev *hdev, u8 status, u16 opcode)
}
}
-static bool hci_stop_discovery(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_remote_name_req_cancel cp;
- struct inquiry_entry *e;
-
- switch (hdev->discovery.state) {
- case DISCOVERY_FINDING:
- if (test_bit(HCI_INQUIRY, &hdev->flags))
- hci_req_add(req, HCI_OP_INQUIRY_CANCEL, 0, NULL);
-
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
- cancel_delayed_work(&hdev->le_scan_disable);
- hci_req_add_le_scan_disable(req);
- }
-
- return true;
-
- case DISCOVERY_RESOLVING:
- e = hci_inquiry_cache_lookup_resolve(hdev, BDADDR_ANY,
- NAME_PENDING);
- if (!e)
- break;
-
- bacpy(&cp.bdaddr, &e->data.bdaddr);
- hci_req_add(req, HCI_OP_REMOTE_NAME_REQ_CANCEL, sizeof(cp),
- &cp);
-
- return true;
-
- default:
- /* Passive scanning */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN)) {
- hci_req_add_le_scan_disable(req);
- return true;
- }
-
- break;
- }
-
- return false;
-}
-
-static void advertising_added(struct sock *sk, struct hci_dev *hdev,
- u8 instance)
+void mgmt_advertising_added(struct sock *sk, struct hci_dev *hdev, u8 instance)
{
struct mgmt_ev_advertising_added ev;
@@ -1469,8 +889,8 @@ static void advertising_added(struct sock *sk, struct hci_dev *hdev,
mgmt_event(MGMT_EV_ADVERTISING_ADDED, hdev, &ev, sizeof(ev), sk);
}
-static void advertising_removed(struct sock *sk, struct hci_dev *hdev,
- u8 instance)
+void mgmt_advertising_removed(struct sock *sk, struct hci_dev *hdev,
+ u8 instance)
{
struct mgmt_ev_advertising_removed ev;
@@ -1479,65 +899,6 @@ static void advertising_removed(struct sock *sk, struct hci_dev *hdev,
mgmt_event(MGMT_EV_ADVERTISING_REMOVED, hdev, &ev, sizeof(ev), sk);
}
-static int schedule_adv_instance(struct hci_request *req, u8 instance,
- bool force) {
- struct hci_dev *hdev = req->hdev;
- struct adv_info *adv_instance = NULL;
- u16 timeout;
-
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
- return -EPERM;
-
- if (hdev->adv_instance_timeout)
- return -EBUSY;
-
- adv_instance = hci_find_adv_instance(hdev, instance);
- if (!adv_instance)
- return -ENOENT;
-
- /* A zero timeout means unlimited advertising. As long as there is
- * only one instance, duration should be ignored. We still set a timeout
- * in case further instances are being added later on.
- *
- * If the remaining lifetime of the instance is more than the duration
- * then the timeout corresponds to the duration, otherwise it will be
- * reduced to the remaining instance lifetime.
- */
- if (adv_instance->timeout == 0 ||
- adv_instance->duration <= adv_instance->remaining_time)
- timeout = adv_instance->duration;
- else
- timeout = adv_instance->remaining_time;
-
- /* The remaining time is being reduced unless the instance is being
- * advertised without time limit.
- */
- if (adv_instance->timeout)
- adv_instance->remaining_time =
- adv_instance->remaining_time - timeout;
-
- hdev->adv_instance_timeout = timeout;
- queue_delayed_work(hdev->workqueue,
- &hdev->adv_instance_expire,
- msecs_to_jiffies(timeout * 1000));
-
- /* If we're just re-scheduling the same instance again then do not
- * execute any HCI commands. This happens when a single instance is
- * being advertised.
- */
- if (!force && hdev->cur_adv_instance == instance &&
- hci_dev_test_flag(hdev, HCI_LE_ADV))
- return 0;
-
- hdev->cur_adv_instance = instance;
- update_adv_data(req);
- update_scan_rsp_data(req);
- enable_advertising(req);
-
- return 0;
-}
-
static void cancel_adv_timeout(struct hci_dev *hdev)
{
if (hdev->adv_instance_timeout) {
@@ -1546,76 +907,6 @@ static void cancel_adv_timeout(struct hci_dev *hdev)
}
}
-/* For a single instance:
- * - force == true: The instance will be removed even when its remaining
- * lifetime is not zero.
- * - force == false: the instance will be deactivated but kept stored unless
- * the remaining lifetime is zero.
- *
- * For instance == 0x00:
- * - force == true: All instances will be removed regardless of their timeout
- * setting.
- * - force == false: Only instances that have a timeout will be removed.
- */
-static void clear_adv_instance(struct hci_dev *hdev, struct hci_request *req,
- u8 instance, bool force)
-{
- struct adv_info *adv_instance, *n, *next_instance = NULL;
- int err;
- u8 rem_inst;
-
- /* Cancel any timeout concerning the removed instance(s). */
- if (!instance || hdev->cur_adv_instance == instance)
- cancel_adv_timeout(hdev);
-
- /* Get the next instance to advertise BEFORE we remove
- * the current one. This can be the same instance again
- * if there is only one instance.
- */
- if (instance && hdev->cur_adv_instance == instance)
- next_instance = hci_get_next_instance(hdev, instance);
-
- if (instance == 0x00) {
- list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances,
- list) {
- if (!(force || adv_instance->timeout))
- continue;
-
- rem_inst = adv_instance->instance;
- err = hci_remove_adv_instance(hdev, rem_inst);
- if (!err)
- advertising_removed(NULL, hdev, rem_inst);
- }
- hdev->cur_adv_instance = 0x00;
- } else {
- adv_instance = hci_find_adv_instance(hdev, instance);
-
- if (force || (adv_instance && adv_instance->timeout &&
- !adv_instance->remaining_time)) {
- /* Don't advertise a removed instance. */
- if (next_instance &&
- next_instance->instance == instance)
- next_instance = NULL;
-
- err = hci_remove_adv_instance(hdev, instance);
- if (!err)
- advertising_removed(NULL, hdev, instance);
- }
- }
-
- if (list_empty(&hdev->adv_instances)) {
- hdev->cur_adv_instance = 0x00;
- hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
- }
-
- if (!req || !hdev_is_powered(hdev) ||
- hci_dev_test_flag(hdev, HCI_ADVERTISING))
- return;
-
- if (next_instance)
- schedule_adv_instance(req, next_instance->instance, false);
-}
-
static int clean_up_hci_state(struct hci_dev *hdev)
{
struct hci_request req;
@@ -1631,12 +922,12 @@ static int clean_up_hci_state(struct hci_dev *hdev)
hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
}
- clear_adv_instance(hdev, NULL, 0x00, false);
+ hci_req_clear_adv_instance(hdev, NULL, 0x00, false);
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- disable_advertising(&req);
+ __hci_req_disable_advertising(&req);
- discov_stopped = hci_stop_discovery(&req);
+ discov_stopped = hci_req_stop_discovery(&req);
list_for_each_entry(conn, &hdev->conn_hash.list, list) {
/* 0x15 == Terminated due to Power Off */
@@ -1671,17 +962,6 @@ static int set_powered(struct sock *sk, struct hci_dev *hdev, void *data,
goto failed;
}
- if (hci_dev_test_and_clear_flag(hdev, HCI_AUTO_OFF)) {
- cancel_delayed_work(&hdev->power_off);
-
- if (cp->val) {
- mgmt_pending_add(sk, MGMT_OP_SET_POWERED, hdev,
- data, len);
- err = mgmt_powered(hdev, 1);
- goto failed;
- }
- }
-
if (!!cp->val == hdev_is_powered(hdev)) {
err = send_settings_rsp(sk, MGMT_OP_SET_POWERED, hdev);
goto failed;
@@ -1805,13 +1085,9 @@ static u8 mgmt_le_support(struct hci_dev *hdev)
return MGMT_STATUS_SUCCESS;
}
-static void set_discoverable_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
+void mgmt_set_discoverable_complete(struct hci_dev *hdev, u8 status)
{
struct mgmt_pending_cmd *cmd;
- struct mgmt_mode *cp;
- struct hci_request req;
- bool changed;
BT_DBG("status 0x%02x", status);
@@ -1828,33 +1104,14 @@ static void set_discoverable_complete(struct hci_dev *hdev, u8 status,
goto remove_cmd;
}
- cp = cmd->param;
- if (cp->val) {
- changed = !hci_dev_test_and_set_flag(hdev, HCI_DISCOVERABLE);
-
- if (hdev->discov_timeout > 0) {
- int to = msecs_to_jiffies(hdev->discov_timeout * 1000);
- queue_delayed_work(hdev->workqueue, &hdev->discov_off,
- to);
- }
- } else {
- changed = hci_dev_test_and_clear_flag(hdev, HCI_DISCOVERABLE);
+ if (hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
+ hdev->discov_timeout > 0) {
+ int to = msecs_to_jiffies(hdev->discov_timeout * 1000);
+ queue_delayed_work(hdev->req_workqueue, &hdev->discov_off, to);
}
send_settings_rsp(cmd->sk, MGMT_OP_SET_DISCOVERABLE, hdev);
-
- if (changed)
- new_settings(hdev, cmd->sk);
-
- /* When the discoverable mode gets changed, make sure
- * that class of device has the limited discoverable
- * bit correctly set. Also update page scan based on whitelist
- * entries.
- */
- hci_req_init(&req, hdev);
- __hci_update_page_scan(&req);
- update_class(&req);
- hci_req_run(&req, NULL);
+ new_settings(hdev, cmd->sk);
remove_cmd:
mgmt_pending_remove(cmd);
@@ -1868,9 +1125,7 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_cp_set_discoverable *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
u16 timeout;
- u8 scan;
int err;
BT_DBG("request for %s", hdev->name);
@@ -1949,8 +1204,8 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
if (cp->val && hdev->discov_timeout > 0) {
int to = msecs_to_jiffies(hdev->discov_timeout * 1000);
- queue_delayed_work(hdev->workqueue, &hdev->discov_off,
- to);
+ queue_delayed_work(hdev->req_workqueue,
+ &hdev->discov_off, to);
}
err = send_settings_rsp(sk, MGMT_OP_SET_DISCOVERABLE, hdev);
@@ -1970,105 +1225,28 @@ static int set_discoverable(struct sock *sk, struct hci_dev *hdev, void *data,
cancel_delayed_work(&hdev->discov_off);
hdev->discov_timeout = timeout;
+ if (cp->val)
+ hci_dev_set_flag(hdev, HCI_DISCOVERABLE);
+ else
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+
/* Limited discoverable mode */
if (cp->val == 0x02)
hci_dev_set_flag(hdev, HCI_LIMITED_DISCOVERABLE);
else
hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
- hci_req_init(&req, hdev);
-
- /* The procedure for LE-only controllers is much simpler - just
- * update the advertising data.
- */
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- goto update_ad;
-
- scan = SCAN_PAGE;
-
- if (cp->val) {
- struct hci_cp_write_current_iac_lap hci_cp;
-
- if (cp->val == 0x02) {
- /* Limited discoverable mode */
- hci_cp.num_iac = min_t(u8, hdev->num_iac, 2);
- hci_cp.iac_lap[0] = 0x00; /* LIAC */
- hci_cp.iac_lap[1] = 0x8b;
- hci_cp.iac_lap[2] = 0x9e;
- hci_cp.iac_lap[3] = 0x33; /* GIAC */
- hci_cp.iac_lap[4] = 0x8b;
- hci_cp.iac_lap[5] = 0x9e;
- } else {
- /* General discoverable mode */
- hci_cp.num_iac = 1;
- hci_cp.iac_lap[0] = 0x33; /* GIAC */
- hci_cp.iac_lap[1] = 0x8b;
- hci_cp.iac_lap[2] = 0x9e;
- }
-
- hci_req_add(&req, HCI_OP_WRITE_CURRENT_IAC_LAP,
- (hci_cp.num_iac * 3) + 1, &hci_cp);
-
- scan |= SCAN_INQUIRY;
- } else {
- hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
- }
-
- hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, sizeof(scan), &scan);
-
-update_ad:
- update_adv_data(&req);
-
- err = hci_req_run(&req, set_discoverable_complete);
- if (err < 0)
- mgmt_pending_remove(cmd);
+ queue_work(hdev->req_workqueue, &hdev->discoverable_update);
+ err = 0;
failed:
hci_dev_unlock(hdev);
return err;
}
-static void write_fast_connectable(struct hci_request *req, bool enable)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_page_scan_activity acp;
- u8 type;
-
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
- return;
-
- if (hdev->hci_ver < BLUETOOTH_VER_1_2)
- return;
-
- if (enable) {
- type = PAGE_SCAN_TYPE_INTERLACED;
-
- /* 160 msec page scan interval */
- acp.interval = cpu_to_le16(0x0100);
- } else {
- type = PAGE_SCAN_TYPE_STANDARD; /* default */
-
- /* default 1.28 sec page scan */
- acp.interval = cpu_to_le16(0x0800);
- }
-
- acp.window = cpu_to_le16(0x0012);
-
- if (__cpu_to_le16(hdev->page_scan_interval) != acp.interval ||
- __cpu_to_le16(hdev->page_scan_window) != acp.window)
- hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_ACTIVITY,
- sizeof(acp), &acp);
-
- if (hdev->page_scan_type != type)
- hci_req_add(req, HCI_OP_WRITE_PAGE_SCAN_TYPE, 1, &type);
-}
-
-static void set_connectable_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
+void mgmt_set_connectable_complete(struct hci_dev *hdev, u8 status)
{
struct mgmt_pending_cmd *cmd;
- struct mgmt_mode *cp;
- bool conn_changed, discov_changed;
BT_DBG("status 0x%02x", status);
@@ -2084,27 +1262,8 @@ static void set_connectable_complete(struct hci_dev *hdev, u8 status,
goto remove_cmd;
}
- cp = cmd->param;
- if (cp->val) {
- conn_changed = !hci_dev_test_and_set_flag(hdev,
- HCI_CONNECTABLE);
- discov_changed = false;
- } else {
- conn_changed = hci_dev_test_and_clear_flag(hdev,
- HCI_CONNECTABLE);
- discov_changed = hci_dev_test_and_clear_flag(hdev,
- HCI_DISCOVERABLE);
- }
-
send_settings_rsp(cmd->sk, MGMT_OP_SET_CONNECTABLE, hdev);
-
- if (conn_changed || discov_changed) {
- new_settings(hdev, cmd->sk);
- hci_update_page_scan(hdev);
- if (discov_changed)
- mgmt_update_adv_data(hdev);
- hci_update_background_scan(hdev);
- }
+ new_settings(hdev, cmd->sk);
remove_cmd:
mgmt_pending_remove(cmd);
@@ -2134,7 +1293,7 @@ static int set_connectable_update_settings(struct hci_dev *hdev,
return err;
if (changed) {
- hci_update_page_scan(hdev);
+ hci_req_update_scan(hdev);
hci_update_background_scan(hdev);
return new_settings(hdev, sk);
}
@@ -2147,8 +1306,6 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_mode *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
- u8 scan;
int err;
BT_DBG("request for %s", hdev->name);
@@ -2182,57 +1339,19 @@ static int set_connectable(struct sock *sk, struct hci_dev *hdev, void *data,
goto failed;
}
- hci_req_init(&req, hdev);
-
- /* If BR/EDR is not enabled and we disable advertising as a
- * by-product of disabling connectable, we need to update the
- * advertising flags.
- */
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
- if (!cp->val) {
- hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
- hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
- }
- update_adv_data(&req);
- } else if (cp->val != test_bit(HCI_PSCAN, &hdev->flags)) {
- if (cp->val) {
- scan = SCAN_PAGE;
- } else {
- /* If we don't have any whitelist entries just
- * disable all scanning. If there are entries
- * and we had both page and inquiry scanning
- * enabled then fall back to only page scanning.
- * Otherwise no changes are needed.
- */
- if (list_empty(&hdev->whitelist))
- scan = SCAN_DISABLED;
- else if (test_bit(HCI_ISCAN, &hdev->flags))
- scan = SCAN_PAGE;
- else
- goto no_scan_update;
-
- if (test_bit(HCI_ISCAN, &hdev->flags) &&
- hdev->discov_timeout > 0)
- cancel_delayed_work(&hdev->discov_off);
- }
+ if (cp->val) {
+ hci_dev_set_flag(hdev, HCI_CONNECTABLE);
+ } else {
+ if (hdev->discov_timeout > 0)
+ cancel_delayed_work(&hdev->discov_off);
- hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE, 1, &scan);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
+ hci_dev_clear_flag(hdev, HCI_CONNECTABLE);
}
-no_scan_update:
- /* Update the advertising parameters if necessary */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
- enable_advertising(&req);
-
- err = hci_req_run(&req, set_connectable_complete);
- if (err < 0) {
- mgmt_pending_remove(cmd);
- if (err == -ENODATA)
- err = set_connectable_update_settings(hdev, sk,
- cp->val);
- goto failed;
- }
+ queue_work(hdev->req_workqueue, &hdev->connectable_update);
+ err = 0;
failed:
hci_dev_unlock(hdev);
@@ -2263,8 +1382,19 @@ static int set_bondable(struct sock *sk, struct hci_dev *hdev, void *data,
if (err < 0)
goto unlock;
- if (changed)
+ if (changed) {
+ /* In limited privacy mode the change of bondable mode
+ * may affect the local advertising address.
+ */
+ if (hdev_is_powered(hdev) &&
+ hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
+ hci_dev_test_flag(hdev, HCI_DISCOVERABLE) &&
+ hci_dev_test_flag(hdev, HCI_LIMITED_PRIVACY))
+ queue_work(hdev->req_workqueue,
+ &hdev->discoverable_update);
+
err = new_settings(hdev, sk);
+ }
unlock:
hci_dev_unlock(hdev);
@@ -2508,10 +1638,10 @@ static void le_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
struct hci_request req;
hci_req_init(&req, hdev);
- update_adv_data(&req);
- update_scan_rsp_data(&req);
- __hci_update_background_scan(&req);
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
hci_req_run(&req, NULL);
+ hci_update_background_scan(hdev);
}
unlock:
@@ -2560,7 +1690,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
enabled = lmp_host_le_capable(hdev);
if (!val)
- clear_adv_instance(hdev, NULL, 0x00, true);
+ hci_req_clear_adv_instance(hdev, NULL, 0x00, true);
if (!hdev_is_powered(hdev) || val == enabled) {
bool changed = false;
@@ -2607,7 +1737,7 @@ static int set_le(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
hci_cp.simul = 0x00;
} else {
if (hci_dev_test_flag(hdev, HCI_LE_ADV))
- disable_advertising(&req);
+ __hci_req_disable_advertising(&req);
}
hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED, sizeof(hci_cp),
@@ -2722,8 +1852,8 @@ static int add_uuid(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
hci_req_init(&req, hdev);
- update_class(&req);
- update_eir(&req);
+ __hci_req_update_class(&req);
+ __hci_req_update_eir(&req);
err = hci_req_run(&req, add_uuid_complete);
if (err < 0) {
@@ -2822,8 +1952,8 @@ static int remove_uuid(struct sock *sk, struct hci_dev *hdev, void *data,
update_class:
hci_req_init(&req, hdev);
- update_class(&req);
- update_eir(&req);
+ __hci_req_update_class(&req);
+ __hci_req_update_eir(&req);
err = hci_req_run(&req, remove_uuid_complete);
if (err < 0) {
@@ -2898,10 +2028,10 @@ static int set_dev_class(struct sock *sk, struct hci_dev *hdev, void *data,
hci_dev_unlock(hdev);
cancel_delayed_work_sync(&hdev->service_cache);
hci_dev_lock(hdev);
- update_eir(&req);
+ __hci_req_update_eir(&req);
}
- update_class(&req);
+ __hci_req_update_class(&req);
err = hci_req_run(&req, set_class_complete);
if (err < 0) {
@@ -3561,8 +2691,7 @@ static int pair_device(struct sock *sk, struct hci_dev *hdev, void *data,
conn = hci_connect_le_scan(hdev, &cp->addr.bdaddr,
addr_type, sec_level,
- HCI_LE_CONN_TIMEOUT,
- HCI_ROLE_MASTER);
+ HCI_LE_CONN_TIMEOUT);
}
if (IS_ERR(conn)) {
@@ -3803,16 +2932,6 @@ static int user_passkey_neg_reply(struct sock *sk, struct hci_dev *hdev,
HCI_OP_USER_PASSKEY_NEG_REPLY, 0);
}
-static void update_name(struct hci_request *req)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_write_local_name cp;
-
- memcpy(cp.name, hdev->dev_name, sizeof(cp.name));
-
- hci_req_add(req, HCI_OP_WRITE_LOCAL_NAME, sizeof(cp), &cp);
-}
-
static void set_name_complete(struct hci_dev *hdev, u8 status, u16 opcode)
{
struct mgmt_cp_set_local_name *cp;
@@ -3891,15 +3010,15 @@ static int set_local_name(struct sock *sk, struct hci_dev *hdev, void *data,
hci_req_init(&req, hdev);
if (lmp_bredr_capable(hdev)) {
- update_name(&req);
- update_eir(&req);
+ __hci_req_update_name(&req);
+ __hci_req_update_eir(&req);
}
/* The name is stored in the scan response data and so
* no need to udpate the advertising data here.
*/
if (lmp_le_capable(hdev))
- update_scan_rsp_data(&req);
+ __hci_req_update_scan_rsp_data(&req, hdev->cur_adv_instance);
err = hci_req_run(&req, set_name_complete);
if (err < 0)
@@ -4164,145 +3283,9 @@ done:
return err;
}
-static bool trigger_bredr_inquiry(struct hci_request *req, u8 *status)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_inquiry cp;
- /* General inquiry access code (GIAC) */
- u8 lap[3] = { 0x33, 0x8b, 0x9e };
-
- *status = mgmt_bredr_support(hdev);
- if (*status)
- return false;
-
- if (hci_dev_test_flag(hdev, HCI_INQUIRY)) {
- *status = MGMT_STATUS_BUSY;
- return false;
- }
-
- hci_inquiry_cache_flush(hdev);
-
- memset(&cp, 0, sizeof(cp));
- memcpy(&cp.lap, lap, sizeof(cp.lap));
- cp.length = DISCOV_BREDR_INQUIRY_LEN;
-
- hci_req_add(req, HCI_OP_INQUIRY, sizeof(cp), &cp);
-
- return true;
-}
-
-static bool trigger_le_scan(struct hci_request *req, u16 interval, u8 *status)
-{
- struct hci_dev *hdev = req->hdev;
- struct hci_cp_le_set_scan_param param_cp;
- struct hci_cp_le_set_scan_enable enable_cp;
- u8 own_addr_type;
- int err;
-
- *status = mgmt_le_support(hdev);
- if (*status)
- return false;
-
- if (hci_dev_test_flag(hdev, HCI_LE_ADV)) {
- /* Don't let discovery abort an outgoing connection attempt
- * that's using directed advertising.
- */
- if (hci_lookup_le_connect(hdev)) {
- *status = MGMT_STATUS_REJECTED;
- return false;
- }
-
- cancel_adv_timeout(hdev);
- disable_advertising(req);
- }
-
- /* If controller is scanning, it means the background scanning is
- * running. Thus, we should temporarily stop it in order to set the
- * discovery scanning parameters.
- */
- if (hci_dev_test_flag(hdev, HCI_LE_SCAN))
- hci_req_add_le_scan_disable(req);
-
- /* All active scans will be done with either a resolvable private
- * address (when privacy feature has been enabled) or non-resolvable
- * private address.
- */
- err = hci_update_random_address(req, true, &own_addr_type);
- if (err < 0) {
- *status = MGMT_STATUS_FAILED;
- return false;
- }
-
- memset(&param_cp, 0, sizeof(param_cp));
- param_cp.type = LE_SCAN_ACTIVE;
- param_cp.interval = cpu_to_le16(interval);
- param_cp.window = cpu_to_le16(DISCOV_LE_SCAN_WIN);
- param_cp.own_address_type = own_addr_type;
-
- hci_req_add(req, HCI_OP_LE_SET_SCAN_PARAM, sizeof(param_cp),
- &param_cp);
-
- memset(&enable_cp, 0, sizeof(enable_cp));
- enable_cp.enable = LE_SCAN_ENABLE;
- enable_cp.filter_dup = LE_SCAN_FILTER_DUP_ENABLE;
-
- hci_req_add(req, HCI_OP_LE_SET_SCAN_ENABLE, sizeof(enable_cp),
- &enable_cp);
-
- return true;
-}
-
-static bool trigger_discovery(struct hci_request *req, u8 *status)
-{
- struct hci_dev *hdev = req->hdev;
-
- switch (hdev->discovery.type) {
- case DISCOV_TYPE_BREDR:
- if (!trigger_bredr_inquiry(req, status))
- return false;
- break;
-
- case DISCOV_TYPE_INTERLEAVED:
- if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY,
- &hdev->quirks)) {
- /* During simultaneous discovery, we double LE scan
- * interval. We must leave some time for the controller
- * to do BR/EDR inquiry.
- */
- if (!trigger_le_scan(req, DISCOV_LE_SCAN_INT * 2,
- status))
- return false;
-
- if (!trigger_bredr_inquiry(req, status))
- return false;
-
- return true;
- }
-
- if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
- *status = MGMT_STATUS_NOT_SUPPORTED;
- return false;
- }
- /* fall through */
-
- case DISCOV_TYPE_LE:
- if (!trigger_le_scan(req, DISCOV_LE_SCAN_INT, status))
- return false;
- break;
-
- default:
- *status = MGMT_STATUS_INVALID_PARAMS;
- return false;
- }
-
- return true;
-}
-
-static void start_discovery_complete(struct hci_dev *hdev, u8 status,
- u16 opcode)
+void mgmt_start_discovery_complete(struct hci_dev *hdev, u8 status)
{
struct mgmt_pending_cmd *cmd;
- unsigned long timeout;
BT_DBG("status %d", status);
@@ -4312,75 +3295,49 @@ static void start_discovery_complete(struct hci_dev *hdev, u8 status,
if (!cmd)
cmd = pending_find(MGMT_OP_START_SERVICE_DISCOVERY, hdev);
+ if (!cmd)
+ cmd = pending_find(MGMT_OP_START_LIMITED_DISCOVERY, hdev);
+
if (cmd) {
cmd->cmd_complete(cmd, mgmt_status(status));
mgmt_pending_remove(cmd);
}
- if (status) {
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
- goto unlock;
- }
-
- hci_discovery_set_state(hdev, DISCOVERY_FINDING);
+ hci_dev_unlock(hdev);
+}
- /* If the scan involves LE scan, pick proper timeout to schedule
- * hdev->le_scan_disable that will stop it.
- */
- switch (hdev->discovery.type) {
+static bool discovery_type_is_valid(struct hci_dev *hdev, uint8_t type,
+ uint8_t *mgmt_status)
+{
+ switch (type) {
case DISCOV_TYPE_LE:
- timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
+ *mgmt_status = mgmt_le_support(hdev);
+ if (*mgmt_status)
+ return false;
break;
case DISCOV_TYPE_INTERLEAVED:
- /* When running simultaneous discovery, the LE scanning time
- * should occupy the whole discovery time sine BR/EDR inquiry
- * and LE scanning are scheduled by the controller.
- *
- * For interleaving discovery in comparison, BR/EDR inquiry
- * and LE scanning are done sequentially with separate
- * timeouts.
- */
- if (test_bit(HCI_QUIRK_SIMULTANEOUS_DISCOVERY, &hdev->quirks))
- timeout = msecs_to_jiffies(DISCOV_LE_TIMEOUT);
- else
- timeout = msecs_to_jiffies(hdev->discov_interleaved_timeout);
- break;
+ *mgmt_status = mgmt_le_support(hdev);
+ if (*mgmt_status)
+ return false;
+ /* Intentional fall-through */
case DISCOV_TYPE_BREDR:
- timeout = 0;
+ *mgmt_status = mgmt_bredr_support(hdev);
+ if (*mgmt_status)
+ return false;
break;
default:
- BT_ERR("Invalid discovery type %d", hdev->discovery.type);
- timeout = 0;
- break;
- }
-
- if (timeout) {
- /* When service discovery is used and the controller has
- * a strict duplicate filter, it is important to remember
- * the start and duration of the scan. This is required
- * for restarting scanning during the discovery phase.
- */
- if (test_bit(HCI_QUIRK_STRICT_DUPLICATE_FILTER,
- &hdev->quirks) &&
- hdev->discovery.result_filtering) {
- hdev->discovery.scan_start = jiffies;
- hdev->discovery.scan_duration = timeout;
- }
-
- queue_delayed_work(hdev->workqueue,
- &hdev->le_scan_disable, timeout);
+ *mgmt_status = MGMT_STATUS_INVALID_PARAMS;
+ return false;
}
-unlock:
- hci_dev_unlock(hdev);
+ return true;
}
-static int start_discovery(struct sock *sk, struct hci_dev *hdev,
- void *data, u16 len)
+static int start_discovery_internal(struct sock *sk, struct hci_dev *hdev,
+ u16 op, void *data, u16 len)
{
struct mgmt_cp_start_discovery *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
u8 status;
int err;
@@ -4389,7 +3346,7 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev,
hci_dev_lock(hdev);
if (!hdev_is_powered(hdev)) {
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY,
+ err = mgmt_cmd_complete(sk, hdev->id, op,
MGMT_STATUS_NOT_POWERED,
&cp->type, sizeof(cp->type));
goto failed;
@@ -4397,20 +3354,17 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev,
if (hdev->discovery.state != DISCOVERY_STOPPED ||
hci_dev_test_flag(hdev, HCI_PERIODIC_INQ)) {
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY,
- MGMT_STATUS_BUSY, &cp->type,
- sizeof(cp->type));
+ err = mgmt_cmd_complete(sk, hdev->id, op, MGMT_STATUS_BUSY,
+ &cp->type, sizeof(cp->type));
goto failed;
}
- cmd = mgmt_pending_add(sk, MGMT_OP_START_DISCOVERY, hdev, data, len);
- if (!cmd) {
- err = -ENOMEM;
+ if (!discovery_type_is_valid(hdev, cp->type, &status)) {
+ err = mgmt_cmd_complete(sk, hdev->id, op, status,
+ &cp->type, sizeof(cp->type));
goto failed;
}
- cmd->cmd_complete = generic_cmd_complete;
-
/* Clear the discovery filter first to free any previously
* allocated memory for the UUID list.
*/
@@ -4418,29 +3372,43 @@ static int start_discovery(struct sock *sk, struct hci_dev *hdev,
hdev->discovery.type = cp->type;
hdev->discovery.report_invalid_rssi = false;
+ if (op == MGMT_OP_START_LIMITED_DISCOVERY)
+ hdev->discovery.limited = true;
+ else
+ hdev->discovery.limited = false;
- hci_req_init(&req, hdev);
-
- if (!trigger_discovery(&req, &status)) {
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_START_DISCOVERY,
- status, &cp->type, sizeof(cp->type));
- mgmt_pending_remove(cmd);
+ cmd = mgmt_pending_add(sk, op, hdev, data, len);
+ if (!cmd) {
+ err = -ENOMEM;
goto failed;
}
- err = hci_req_run(&req, start_discovery_complete);
- if (err < 0) {
- mgmt_pending_remove(cmd);
- goto failed;
- }
+ cmd->cmd_complete = generic_cmd_complete;
hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+ queue_work(hdev->req_workqueue, &hdev->discov_update);
+ err = 0;
failed:
hci_dev_unlock(hdev);
return err;
}
+static int start_discovery(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ return start_discovery_internal(sk, hdev, MGMT_OP_START_DISCOVERY,
+ data, len);
+}
+
+static int start_limited_discovery(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 len)
+{
+ return start_discovery_internal(sk, hdev,
+ MGMT_OP_START_LIMITED_DISCOVERY,
+ data, len);
+}
+
static int service_discovery_cmd_complete(struct mgmt_pending_cmd *cmd,
u8 status)
{
@@ -4453,7 +3421,6 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_cp_start_service_discovery *cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
const u16 max_uuid_count = ((U16_MAX - sizeof(*cp)) / 16);
u16 uuid_count, expected_len;
u8 status;
@@ -4502,6 +3469,13 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
goto failed;
}
+ if (!discovery_type_is_valid(hdev, cp->type, &status)) {
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_START_SERVICE_DISCOVERY,
+ status, &cp->type, sizeof(cp->type));
+ goto failed;
+ }
+
cmd = mgmt_pending_add(sk, MGMT_OP_START_SERVICE_DISCOVERY,
hdev, data, len);
if (!cmd) {
@@ -4534,30 +3508,16 @@ static int start_service_discovery(struct sock *sk, struct hci_dev *hdev,
}
}
- hci_req_init(&req, hdev);
-
- if (!trigger_discovery(&req, &status)) {
- err = mgmt_cmd_complete(sk, hdev->id,
- MGMT_OP_START_SERVICE_DISCOVERY,
- status, &cp->type, sizeof(cp->type));
- mgmt_pending_remove(cmd);
- goto failed;
- }
-
- err = hci_req_run(&req, start_discovery_complete);
- if (err < 0) {
- mgmt_pending_remove(cmd);
- goto failed;
- }
-
hci_discovery_set_state(hdev, DISCOVERY_STARTING);
+ queue_work(hdev->req_workqueue, &hdev->discov_update);
+ err = 0;
failed:
hci_dev_unlock(hdev);
return err;
}
-static void stop_discovery_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+void mgmt_stop_discovery_complete(struct hci_dev *hdev, u8 status)
{
struct mgmt_pending_cmd *cmd;
@@ -4571,9 +3531,6 @@ static void stop_discovery_complete(struct hci_dev *hdev, u8 status, u16 opcode)
mgmt_pending_remove(cmd);
}
- if (!status)
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
-
hci_dev_unlock(hdev);
}
@@ -4582,7 +3539,6 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
{
struct mgmt_cp_stop_discovery *mgmt_cp = data;
struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
BT_DBG("%s", hdev->name);
@@ -4611,24 +3567,9 @@ static int stop_discovery(struct sock *sk, struct hci_dev *hdev, void *data,
cmd->cmd_complete = generic_cmd_complete;
- hci_req_init(&req, hdev);
-
- hci_stop_discovery(&req);
-
- err = hci_req_run(&req, stop_discovery_complete);
- if (!err) {
- hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
- goto unlock;
- }
-
- mgmt_pending_remove(cmd);
-
- /* If no HCI commands were sent we're done */
- if (err == -ENODATA) {
- err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_STOP_DISCOVERY, 0,
- &mgmt_cp->type, sizeof(mgmt_cp->type));
- hci_discovery_set_state(hdev, DISCOVERY_STOPPED);
- }
+ hci_discovery_set_state(hdev, DISCOVERY_STOPPING);
+ queue_work(hdev->req_workqueue, &hdev->discov_update);
+ err = 0;
unlock:
hci_dev_unlock(hdev);
@@ -4776,7 +3717,7 @@ static int set_device_id(struct sock *sk, struct hci_dev *hdev, void *data,
NULL, 0);
hci_req_init(&req, hdev);
- update_eir(&req);
+ __hci_req_update_eir(&req);
hci_req_run(&req, NULL);
hci_dev_unlock(hdev);
@@ -4826,7 +3767,6 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status,
* set up earlier, then re-enable multi-instance advertising.
*/
if (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) ||
list_empty(&hdev->adv_instances))
goto unlock;
@@ -4842,7 +3782,7 @@ static void set_advertising_complete(struct hci_dev *hdev, u8 status,
hci_req_init(&req, hdev);
- err = schedule_adv_instance(&req, instance, true);
+ err = __hci_req_schedule_adv_instance(&req, instance, true);
if (!err)
err = hci_req_run(&req, enable_advertising_instance);
@@ -4892,6 +3832,7 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
bool changed;
if (cp->val) {
+ hdev->cur_adv_instance = 0x00;
changed = !hci_dev_test_and_set_flag(hdev, HCI_ADVERTISING);
if (cp->val == 0x02)
hci_dev_set_flag(hdev, HCI_ADVERTISING_CONNECTABLE);
@@ -4939,11 +3880,12 @@ static int set_advertising(struct sock *sk, struct hci_dev *hdev, void *data,
* We cannot use update_[adv|scan_rsp]_data() here as the
* HCI_ADVERTISING flag is not yet set.
*/
- update_inst_adv_data(&req, 0x00);
- update_inst_scan_rsp_data(&req, 0x00);
- enable_advertising(&req);
+ hdev->cur_adv_instance = 0x00;
+ __hci_req_update_adv_data(&req, 0x00);
+ __hci_req_update_scan_rsp_data(&req, 0x00);
+ __hci_req_enable_advertising(&req);
} else {
- disable_advertising(&req);
+ __hci_req_disable_advertising(&req);
}
err = hci_req_run(&req, set_advertising_complete);
@@ -5140,7 +4082,7 @@ static int set_fast_connectable(struct sock *sk, struct hci_dev *hdev,
hci_req_init(&req, hdev);
- write_fast_connectable(&req, cp->val);
+ __hci_req_write_fast_connectable(&req, cp->val);
err = hci_req_run(&req, fast_connectable_complete);
if (err < 0) {
@@ -5275,20 +4217,20 @@ static int set_bredr(struct sock *sk, struct hci_dev *hdev, void *data, u16 len)
goto unlock;
}
- /* We need to flip the bit already here so that update_adv_data
- * generates the correct flags.
+ /* We need to flip the bit already here so that
+ * hci_req_update_adv_data generates the correct flags.
*/
hci_dev_set_flag(hdev, HCI_BREDR_ENABLED);
hci_req_init(&req, hdev);
- write_fast_connectable(&req, false);
- __hci_update_page_scan(&req);
+ __hci_req_write_fast_connectable(&req, false);
+ __hci_req_update_scan(&req);
/* Since only the advertising data flags will change, there
* is no need to update the scan response data.
*/
- update_adv_data(&req);
+ __hci_req_update_adv_data(&req, hdev->cur_adv_instance);
err = hci_req_run(&req, set_bredr_complete);
if (err < 0)
@@ -5492,7 +4434,7 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
MGMT_STATUS_NOT_SUPPORTED);
- if (cp->privacy != 0x00 && cp->privacy != 0x01)
+ if (cp->privacy != 0x00 && cp->privacy != 0x01 && cp->privacy != 0x02)
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_SET_PRIVACY,
MGMT_STATUS_INVALID_PARAMS);
@@ -5511,10 +4453,15 @@ static int set_privacy(struct sock *sk, struct hci_dev *hdev, void *cp_data,
changed = !hci_dev_test_and_set_flag(hdev, HCI_PRIVACY);
memcpy(hdev->irk, cp->irk, sizeof(hdev->irk));
hci_dev_set_flag(hdev, HCI_RPA_EXPIRED);
+ if (cp->privacy == 0x02)
+ hci_dev_set_flag(hdev, HCI_LIMITED_PRIVACY);
+ else
+ hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
} else {
changed = hci_dev_test_and_clear_flag(hdev, HCI_PRIVACY);
memset(hdev->irk, 0, sizeof(hdev->irk));
hci_dev_clear_flag(hdev, HCI_RPA_EXPIRED);
+ hci_dev_clear_flag(hdev, HCI_LIMITED_PRIVACY);
}
err = send_settings_rsp(sk, MGMT_OP_SET_PRIVACY, hdev);
@@ -6076,10 +5023,9 @@ static bool is_connected(struct hci_dev *hdev, bdaddr_t *addr, u8 type)
}
/* This function requires the caller holds hdev->lock */
-static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr,
+static int hci_conn_params_set(struct hci_dev *hdev, bdaddr_t *addr,
u8 addr_type, u8 auto_connect)
{
- struct hci_dev *hdev = req->hdev;
struct hci_conn_params *params;
params = hci_conn_params_add(hdev, addr, addr_type);
@@ -6099,26 +5045,17 @@ static int hci_conn_params_set(struct hci_request *req, bdaddr_t *addr,
*/
if (params->explicit_connect)
list_add(&params->action, &hdev->pend_le_conns);
-
- __hci_update_background_scan(req);
break;
case HCI_AUTO_CONN_REPORT:
if (params->explicit_connect)
list_add(&params->action, &hdev->pend_le_conns);
else
list_add(&params->action, &hdev->pend_le_reports);
- __hci_update_background_scan(req);
break;
case HCI_AUTO_CONN_DIRECT:
case HCI_AUTO_CONN_ALWAYS:
- if (!is_connected(hdev, addr, addr_type)) {
+ if (!is_connected(hdev, addr, addr_type))
list_add(&params->action, &hdev->pend_le_conns);
- /* If we are in scan phase of connecting, we were
- * already added to pend_le_conns and scanning.
- */
- if (params->auto_connect != HCI_AUTO_CONN_EXPLICIT)
- __hci_update_background_scan(req);
- }
break;
}
@@ -6142,31 +5079,10 @@ static void device_added(struct sock *sk, struct hci_dev *hdev,
mgmt_event(MGMT_EV_DEVICE_ADDED, hdev, &ev, sizeof(ev), sk);
}
-static void add_device_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
- struct mgmt_pending_cmd *cmd;
-
- BT_DBG("status 0x%02x", status);
-
- hci_dev_lock(hdev);
-
- cmd = pending_find(MGMT_OP_ADD_DEVICE, hdev);
- if (!cmd)
- goto unlock;
-
- cmd->cmd_complete(cmd, mgmt_status(status));
- mgmt_pending_remove(cmd);
-
-unlock:
- hci_dev_unlock(hdev);
-}
-
static int add_device(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
struct mgmt_cp_add_device *cp = data;
- struct mgmt_pending_cmd *cmd;
- struct hci_request req;
u8 auto_conn, addr_type;
int err;
@@ -6183,24 +5099,15 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
MGMT_STATUS_INVALID_PARAMS,
&cp->addr, sizeof(cp->addr));
- hci_req_init(&req, hdev);
-
hci_dev_lock(hdev);
- cmd = mgmt_pending_add(sk, MGMT_OP_ADD_DEVICE, hdev, data, len);
- if (!cmd) {
- err = -ENOMEM;
- goto unlock;
- }
-
- cmd->cmd_complete = addr_cmd_complete;
-
if (cp->addr.type == BDADDR_BREDR) {
/* Only incoming connections action is supported for now */
if (cp->action != 0x01) {
- err = cmd->cmd_complete(cmd,
- MGMT_STATUS_INVALID_PARAMS);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
goto unlock;
}
@@ -6209,7 +5116,7 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
if (err)
goto unlock;
- __hci_update_page_scan(&req);
+ hci_req_update_scan(hdev);
goto added;
}
@@ -6229,33 +5136,31 @@ static int add_device(struct sock *sk, struct hci_dev *hdev,
* hci_conn_params_lookup.
*/
if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) {
- err = cmd->cmd_complete(cmd, MGMT_STATUS_INVALID_PARAMS);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
goto unlock;
}
/* If the connection parameters don't exist for this device,
* they will be created and configured with defaults.
*/
- if (hci_conn_params_set(&req, &cp->addr.bdaddr, addr_type,
+ if (hci_conn_params_set(hdev, &cp->addr.bdaddr, addr_type,
auto_conn) < 0) {
- err = cmd->cmd_complete(cmd, MGMT_STATUS_FAILED);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_FAILED, &cp->addr,
+ sizeof(cp->addr));
goto unlock;
}
+ hci_update_background_scan(hdev);
+
added:
device_added(sk, hdev, &cp->addr.bdaddr, cp->addr.type, cp->action);
- err = hci_req_run(&req, add_device_complete);
- if (err < 0) {
- /* ENODATA means no HCI commands were needed (e.g. if
- * the adapter is powered off).
- */
- if (err == -ENODATA)
- err = cmd->cmd_complete(cmd, MGMT_STATUS_SUCCESS);
- mgmt_pending_remove(cmd);
- }
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_ADD_DEVICE,
+ MGMT_STATUS_SUCCESS, &cp->addr,
+ sizeof(cp->addr));
unlock:
hci_dev_unlock(hdev);
@@ -6273,55 +5178,25 @@ static void device_removed(struct sock *sk, struct hci_dev *hdev,
mgmt_event(MGMT_EV_DEVICE_REMOVED, hdev, &ev, sizeof(ev), sk);
}
-static void remove_device_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
- struct mgmt_pending_cmd *cmd;
-
- BT_DBG("status 0x%02x", status);
-
- hci_dev_lock(hdev);
-
- cmd = pending_find(MGMT_OP_REMOVE_DEVICE, hdev);
- if (!cmd)
- goto unlock;
-
- cmd->cmd_complete(cmd, mgmt_status(status));
- mgmt_pending_remove(cmd);
-
-unlock:
- hci_dev_unlock(hdev);
-}
-
static int remove_device(struct sock *sk, struct hci_dev *hdev,
void *data, u16 len)
{
struct mgmt_cp_remove_device *cp = data;
- struct mgmt_pending_cmd *cmd;
- struct hci_request req;
int err;
BT_DBG("%s", hdev->name);
- hci_req_init(&req, hdev);
-
hci_dev_lock(hdev);
- cmd = mgmt_pending_add(sk, MGMT_OP_REMOVE_DEVICE, hdev, data, len);
- if (!cmd) {
- err = -ENOMEM;
- goto unlock;
- }
-
- cmd->cmd_complete = addr_cmd_complete;
-
if (bacmp(&cp->addr.bdaddr, BDADDR_ANY)) {
struct hci_conn_params *params;
u8 addr_type;
if (!bdaddr_type_is_valid(cp->addr.type)) {
- err = cmd->cmd_complete(cmd,
- MGMT_STATUS_INVALID_PARAMS);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
goto unlock;
}
@@ -6330,13 +5205,15 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
&cp->addr.bdaddr,
cp->addr.type);
if (err) {
- err = cmd->cmd_complete(cmd,
- MGMT_STATUS_INVALID_PARAMS);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr,
+ sizeof(cp->addr));
goto unlock;
}
- __hci_update_page_scan(&req);
+ hci_req_update_scan(hdev);
device_removed(sk, hdev, &cp->addr.bdaddr,
cp->addr.type);
@@ -6351,33 +5228,36 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
* hci_conn_params_lookup.
*/
if (!hci_is_identity_address(&cp->addr.bdaddr, addr_type)) {
- err = cmd->cmd_complete(cmd,
- MGMT_STATUS_INVALID_PARAMS);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
goto unlock;
}
params = hci_conn_params_lookup(hdev, &cp->addr.bdaddr,
addr_type);
if (!params) {
- err = cmd->cmd_complete(cmd,
- MGMT_STATUS_INVALID_PARAMS);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
goto unlock;
}
if (params->auto_connect == HCI_AUTO_CONN_DISABLED ||
params->auto_connect == HCI_AUTO_CONN_EXPLICIT) {
- err = cmd->cmd_complete(cmd,
- MGMT_STATUS_INVALID_PARAMS);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
goto unlock;
}
list_del(&params->action);
list_del(&params->list);
kfree(params);
- __hci_update_background_scan(&req);
+ hci_update_background_scan(hdev);
device_removed(sk, hdev, &cp->addr.bdaddr, cp->addr.type);
} else {
@@ -6385,9 +5265,10 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
struct bdaddr_list *b, *btmp;
if (cp->addr.type) {
- err = cmd->cmd_complete(cmd,
- MGMT_STATUS_INVALID_PARAMS);
- mgmt_pending_remove(cmd);
+ err = mgmt_cmd_complete(sk, hdev->id,
+ MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_INVALID_PARAMS,
+ &cp->addr, sizeof(cp->addr));
goto unlock;
}
@@ -6397,7 +5278,7 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
kfree(b);
}
- __hci_update_page_scan(&req);
+ hci_req_update_scan(hdev);
list_for_each_entry_safe(p, tmp, &hdev->le_conn_params, list) {
if (p->auto_connect == HCI_AUTO_CONN_DISABLED)
@@ -6414,20 +5295,13 @@ static int remove_device(struct sock *sk, struct hci_dev *hdev,
BT_DBG("All LE connection parameters were removed");
- __hci_update_background_scan(&req);
+ hci_update_background_scan(hdev);
}
complete:
- err = hci_req_run(&req, remove_device_complete);
- if (err < 0) {
- /* ENODATA means no HCI commands were needed (e.g. if
- * the adapter is powered off).
- */
- if (err == -ENODATA)
- err = cmd->cmd_complete(cmd, MGMT_STATUS_SUCCESS);
- mgmt_pending_remove(cmd);
- }
-
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_REMOVE_DEVICE,
+ MGMT_STATUS_SUCCESS, &cp->addr,
+ sizeof(cp->addr));
unlock:
hci_dev_unlock(hdev);
return err;
@@ -6898,7 +5772,7 @@ static int read_local_oob_ext_data(struct sock *sk, struct hci_dev *hdev,
rand, sizeof(rand));
}
- flags = get_adv_discov_flags(hdev);
+ flags = mgmt_get_adv_discov_flags(hdev);
if (!hci_dev_test_flag(hdev, HCI_BREDR_ENABLED))
flags |= LE_AD_NO_BREDR;
@@ -6953,10 +5827,10 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
{
struct mgmt_rp_read_adv_features *rp;
size_t rp_len;
- int err, i;
- bool instance;
+ int err;
struct adv_info *adv_instance;
u32 supported_flags;
+ u8 *instance;
BT_DBG("%s", hdev->name);
@@ -6966,12 +5840,7 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
hci_dev_lock(hdev);
- rp_len = sizeof(*rp);
-
- instance = hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE);
- if (instance)
- rp_len += hdev->adv_instance_cnt;
-
+ rp_len = sizeof(*rp) + hdev->adv_instance_cnt;
rp = kmalloc(rp_len, GFP_ATOMIC);
if (!rp) {
hci_dev_unlock(hdev);
@@ -6984,19 +5853,12 @@ static int read_adv_features(struct sock *sk, struct hci_dev *hdev,
rp->max_adv_data_len = HCI_MAX_AD_LENGTH;
rp->max_scan_rsp_len = HCI_MAX_AD_LENGTH;
rp->max_instances = HCI_MAX_ADV_INSTANCES;
+ rp->num_instances = hdev->adv_instance_cnt;
- if (instance) {
- i = 0;
- list_for_each_entry(adv_instance, &hdev->adv_instances, list) {
- if (i >= hdev->adv_instance_cnt)
- break;
-
- rp->instance[i] = adv_instance->instance;
- i++;
- }
- rp->num_instances = hdev->adv_instance_cnt;
- } else {
- rp->num_instances = 0;
+ instance = rp->instance;
+ list_for_each_entry(adv_instance, &hdev->adv_instances, list) {
+ *instance = adv_instance->instance;
+ instance++;
}
hci_dev_unlock(hdev);
@@ -7016,17 +5878,19 @@ static bool tlv_data_is_valid(struct hci_dev *hdev, u32 adv_flags, u8 *data,
int i, cur_len;
bool flags_managed = false;
bool tx_power_managed = false;
- u32 flags_params = MGMT_ADV_FLAG_DISCOV | MGMT_ADV_FLAG_LIMITED_DISCOV |
- MGMT_ADV_FLAG_MANAGED_FLAGS;
- if (is_adv_data && (adv_flags & flags_params)) {
- flags_managed = true;
- max_len -= 3;
- }
+ if (is_adv_data) {
+ if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
+ MGMT_ADV_FLAG_LIMITED_DISCOV |
+ MGMT_ADV_FLAG_MANAGED_FLAGS)) {
+ flags_managed = true;
+ max_len -= 3;
+ }
- if (is_adv_data && (adv_flags & MGMT_ADV_FLAG_TX_POWER)) {
- tx_power_managed = true;
- max_len -= 3;
+ if (adv_flags & MGMT_ADV_FLAG_TX_POWER) {
+ tx_power_managed = true;
+ max_len -= 3;
+ }
}
if (len > max_len)
@@ -7067,9 +5931,6 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status,
cmd = pending_find(MGMT_OP_ADD_ADVERTISING, hdev);
- if (status)
- hci_dev_clear_flag(hdev, HCI_ADVERTISING_INSTANCE);
-
list_for_each_entry_safe(adv_instance, n, &hdev->adv_instances, list) {
if (!adv_instance->pending)
continue;
@@ -7085,7 +5946,7 @@ static void add_advertising_complete(struct hci_dev *hdev, u8 status,
cancel_adv_timeout(hdev);
hci_remove_adv_instance(hdev, instance);
- advertising_removed(cmd ? cmd->sk : NULL, hdev, instance);
+ mgmt_advertising_removed(cmd ? cmd->sk : NULL, hdev, instance);
}
if (!cmd)
@@ -7107,31 +5968,6 @@ unlock:
hci_dev_unlock(hdev);
}
-void mgmt_adv_timeout_expired(struct hci_dev *hdev)
-{
- u8 instance;
- struct hci_request req;
-
- hdev->adv_instance_timeout = 0;
-
- instance = get_current_adv_instance(hdev);
- if (instance == 0x00)
- return;
-
- hci_dev_lock(hdev);
- hci_req_init(&req, hdev);
-
- clear_adv_instance(hdev, &req, instance, false);
-
- if (list_empty(&hdev->adv_instances))
- disable_advertising(&req);
-
- if (!skb_queue_empty(&req.cmd_q))
- hci_req_run(&req, NULL);
-
- hci_dev_unlock(hdev);
-}
-
static int add_advertising(struct sock *sk, struct hci_dev *hdev,
void *data, u16 data_len)
{
@@ -7155,6 +5991,14 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
status);
+ if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ if (data_len != sizeof(*cp) + cp->adv_data_len + cp->scan_rsp_len)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_ADD_ADVERTISING,
+ MGMT_STATUS_INVALID_PARAMS);
+
flags = __le32_to_cpu(cp->flags);
timeout = __le16_to_cpu(cp->timeout);
duration = __le16_to_cpu(cp->duration);
@@ -7206,9 +6050,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
* actually added.
*/
if (hdev->adv_instance_cnt > prev_instance_cnt)
- advertising_added(sk, hdev, cp->instance);
-
- hci_dev_set_flag(hdev, HCI_ADVERTISING_INSTANCE);
+ mgmt_advertising_added(sk, hdev, cp->instance);
if (hdev->cur_adv_instance == cp->instance) {
/* If the currently advertised instance is being changed then
@@ -7253,7 +6095,7 @@ static int add_advertising(struct sock *sk, struct hci_dev *hdev,
hci_req_init(&req, hdev);
- err = schedule_adv_instance(&req, schedule_instance, true);
+ err = __hci_req_schedule_adv_instance(&req, schedule_instance, true);
if (!err)
err = hci_req_run(&req, add_advertising_complete);
@@ -7325,7 +6167,7 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
goto unlock;
}
- if (!hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE)) {
+ if (list_empty(&hdev->adv_instances)) {
err = mgmt_cmd_status(sk, hdev->id, MGMT_OP_REMOVE_ADVERTISING,
MGMT_STATUS_INVALID_PARAMS);
goto unlock;
@@ -7333,10 +6175,10 @@ static int remove_advertising(struct sock *sk, struct hci_dev *hdev,
hci_req_init(&req, hdev);
- clear_adv_instance(hdev, &req, cp->instance, true);
+ hci_req_clear_adv_instance(hdev, &req, cp->instance, true);
if (list_empty(&hdev->adv_instances))
- disable_advertising(&req);
+ __hci_req_disable_advertising(&req);
/* If no HCI commands have been collected so far or the HCI_ADVERTISING
* flag is set or the device isn't powered then we have no HCI
@@ -7369,6 +6211,62 @@ unlock:
return err;
}
+static u8 tlv_data_max_len(u32 adv_flags, bool is_adv_data)
+{
+ u8 max_len = HCI_MAX_AD_LENGTH;
+
+ if (is_adv_data) {
+ if (adv_flags & (MGMT_ADV_FLAG_DISCOV |
+ MGMT_ADV_FLAG_LIMITED_DISCOV |
+ MGMT_ADV_FLAG_MANAGED_FLAGS))
+ max_len -= 3;
+
+ if (adv_flags & MGMT_ADV_FLAG_TX_POWER)
+ max_len -= 3;
+ }
+
+ return max_len;
+}
+
+static int get_adv_size_info(struct sock *sk, struct hci_dev *hdev,
+ void *data, u16 data_len)
+{
+ struct mgmt_cp_get_adv_size_info *cp = data;
+ struct mgmt_rp_get_adv_size_info rp;
+ u32 flags, supported_flags;
+ int err;
+
+ BT_DBG("%s", hdev->name);
+
+ if (!lmp_le_capable(hdev))
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_REJECTED);
+
+ if (cp->instance < 1 || cp->instance > HCI_MAX_ADV_INSTANCES)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ flags = __le32_to_cpu(cp->flags);
+
+ /* The current implementation only supports a subset of the specified
+ * flags.
+ */
+ supported_flags = get_supported_adv_flags(hdev);
+ if (flags & ~supported_flags)
+ return mgmt_cmd_status(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_INVALID_PARAMS);
+
+ rp.instance = cp->instance;
+ rp.flags = cp->flags;
+ rp.max_adv_data_len = tlv_data_max_len(flags, true);
+ rp.max_scan_rsp_len = tlv_data_max_len(flags, false);
+
+ err = mgmt_cmd_complete(sk, hdev->id, MGMT_OP_GET_ADV_SIZE_INFO,
+ MGMT_STATUS_SUCCESS, &rp, sizeof(rp));
+
+ return err;
+}
+
static const struct hci_mgmt_handler mgmt_handlers[] = {
{ NULL }, /* 0x0000 (no command) */
{ read_version, MGMT_READ_VERSION_SIZE,
@@ -7456,6 +6354,8 @@ static const struct hci_mgmt_handler mgmt_handlers[] = {
{ add_advertising, MGMT_ADD_ADVERTISING_SIZE,
HCI_MGMT_VAR_LEN },
{ remove_advertising, MGMT_REMOVE_ADVERTISING_SIZE },
+ { get_adv_size_info, MGMT_GET_ADV_SIZE_INFO_SIZE },
+ { start_limited_discovery, MGMT_START_DISCOVERY_SIZE },
};
void mgmt_index_added(struct hci_dev *hdev)
@@ -7526,9 +6426,8 @@ void mgmt_index_removed(struct hci_dev *hdev)
}
/* This function requires the caller holds hdev->lock */
-static void restart_le_actions(struct hci_request *req)
+static void restart_le_actions(struct hci_dev *hdev)
{
- struct hci_dev *hdev = req->hdev;
struct hci_conn_params *p;
list_for_each_entry(p, &hdev->le_conn_params, list) {
@@ -7549,141 +6448,35 @@ static void restart_le_actions(struct hci_request *req)
break;
}
}
-
- __hci_update_background_scan(req);
}
-static void powered_complete(struct hci_dev *hdev, u8 status, u16 opcode)
+void mgmt_power_on(struct hci_dev *hdev, int err)
{
struct cmd_lookup match = { NULL, hdev };
- BT_DBG("status 0x%02x", status);
-
- if (!status) {
- /* Register the available SMP channels (BR/EDR and LE) only
- * when successfully powering on the controller. This late
- * registration is required so that LE SMP can clearly
- * decide if the public address or static address is used.
- */
- smp_register(hdev);
- }
+ BT_DBG("err %d", err);
hci_dev_lock(hdev);
+ if (!err) {
+ restart_le_actions(hdev);
+ hci_update_background_scan(hdev);
+ }
+
mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);
new_settings(hdev, match.sk);
- hci_dev_unlock(hdev);
-
if (match.sk)
sock_put(match.sk);
-}
-
-static int powered_update_hci(struct hci_dev *hdev)
-{
- struct hci_request req;
- struct adv_info *adv_instance;
- u8 link_sec;
-
- hci_req_init(&req, hdev);
-
- if (hci_dev_test_flag(hdev, HCI_SSP_ENABLED) &&
- !lmp_host_ssp_capable(hdev)) {
- u8 mode = 0x01;
-
- hci_req_add(&req, HCI_OP_WRITE_SSP_MODE, sizeof(mode), &mode);
-
- if (bredr_sc_enabled(hdev) && !lmp_host_sc_capable(hdev)) {
- u8 support = 0x01;
-
- hci_req_add(&req, HCI_OP_WRITE_SC_SUPPORT,
- sizeof(support), &support);
- }
- }
-
- if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
- lmp_bredr_capable(hdev)) {
- struct hci_cp_write_le_host_supported cp;
-
- cp.le = 0x01;
- cp.simul = 0x00;
-
- /* Check first if we already have the right
- * host state (host features set)
- */
- if (cp.le != lmp_host_le_capable(hdev) ||
- cp.simul != lmp_host_le_br_capable(hdev))
- hci_req_add(&req, HCI_OP_WRITE_LE_HOST_SUPPORTED,
- sizeof(cp), &cp);
- }
-
- if (lmp_le_capable(hdev)) {
- /* Make sure the controller has a good default for
- * advertising data. This also applies to the case
- * where BR/EDR was toggled during the AUTO_OFF phase.
- */
- if (hci_dev_test_flag(hdev, HCI_LE_ENABLED) &&
- (hci_dev_test_flag(hdev, HCI_ADVERTISING) ||
- !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))) {
- update_adv_data(&req);
- update_scan_rsp_data(&req);
- }
-
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
- hdev->cur_adv_instance == 0x00 &&
- !list_empty(&hdev->adv_instances)) {
- adv_instance = list_first_entry(&hdev->adv_instances,
- struct adv_info, list);
- hdev->cur_adv_instance = adv_instance->instance;
- }
-
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
- enable_advertising(&req);
- else if (hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE) &&
- hdev->cur_adv_instance)
- schedule_adv_instance(&req, hdev->cur_adv_instance,
- true);
-
- restart_le_actions(&req);
- }
-
- link_sec = hci_dev_test_flag(hdev, HCI_LINK_SECURITY);
- if (link_sec != test_bit(HCI_AUTH, &hdev->flags))
- hci_req_add(&req, HCI_OP_WRITE_AUTH_ENABLE,
- sizeof(link_sec), &link_sec);
- if (lmp_bredr_capable(hdev)) {
- if (hci_dev_test_flag(hdev, HCI_FAST_CONNECTABLE))
- write_fast_connectable(&req, true);
- else
- write_fast_connectable(&req, false);
- __hci_update_page_scan(&req);
- update_class(&req);
- update_name(&req);
- update_eir(&req);
- }
-
- return hci_req_run(&req, powered_complete);
+ hci_dev_unlock(hdev);
}
-int mgmt_powered(struct hci_dev *hdev, u8 powered)
+void __mgmt_power_off(struct hci_dev *hdev)
{
struct cmd_lookup match = { NULL, hdev };
u8 status, zero_cod[] = { 0, 0, 0 };
- int err;
-
- if (!hci_dev_test_flag(hdev, HCI_MGMT))
- return 0;
-
- if (powered) {
- if (powered_update_hci(hdev) == 0)
- return 0;
-
- mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp,
- &match);
- goto new_settings;
- }
mgmt_pending_foreach(MGMT_OP_SET_POWERED, hdev, settings_rsp, &match);
@@ -7705,13 +6498,10 @@ int mgmt_powered(struct hci_dev *hdev, u8 powered)
mgmt_generic_event(MGMT_EV_CLASS_OF_DEV_CHANGED, hdev,
zero_cod, sizeof(zero_cod), NULL);
-new_settings:
- err = new_settings(hdev, match.sk);
+ new_settings(hdev, match.sk);
if (match.sk)
sock_put(match.sk);
-
- return err;
}
void mgmt_set_powered_failed(struct hci_dev *hdev, int err)
@@ -7733,43 +6523,6 @@ void mgmt_set_powered_failed(struct hci_dev *hdev, int err)
mgmt_pending_remove(cmd);
}
-void mgmt_discoverable_timeout(struct hci_dev *hdev)
-{
- struct hci_request req;
-
- hci_dev_lock(hdev);
-
- /* When discoverable timeout triggers, then just make sure
- * the limited discoverable flag is cleared. Even in the case
- * of a timeout triggered from general discoverable, it is
- * safe to unconditionally clear the flag.
- */
- hci_dev_clear_flag(hdev, HCI_LIMITED_DISCOVERABLE);
- hci_dev_clear_flag(hdev, HCI_DISCOVERABLE);
-
- hci_req_init(&req, hdev);
- if (hci_dev_test_flag(hdev, HCI_BREDR_ENABLED)) {
- u8 scan = SCAN_PAGE;
- hci_req_add(&req, HCI_OP_WRITE_SCAN_ENABLE,
- sizeof(scan), &scan);
- }
- update_class(&req);
-
- /* Advertising instances don't use the global discoverable setting, so
- * only update AD if advertising was enabled using Set Advertising.
- */
- if (hci_dev_test_flag(hdev, HCI_ADVERTISING))
- update_adv_data(&req);
-
- hci_req_run(&req, NULL);
-
- hdev->discov_timeout = 0;
-
- new_settings(hdev, NULL);
-
- hci_dev_unlock(hdev);
-}
-
void mgmt_new_link_key(struct hci_dev *hdev, struct link_key *key,
bool persistent)
{
@@ -8312,7 +7065,7 @@ void mgmt_ssp_enable_complete(struct hci_dev *hdev, u8 enable, u8 status)
if (hci_dev_test_flag(hdev, HCI_USE_DEBUG_KEYS))
hci_req_add(&req, HCI_OP_WRITE_SSP_DEBUG_MODE,
sizeof(enable), &enable);
- update_eir(&req);
+ __hci_req_update_eir(&req);
} else {
clear_eir(&req);
}
@@ -8452,7 +7205,7 @@ static void restart_le_scan(struct hci_dev *hdev)
hdev->discovery.scan_duration))
return;
- queue_delayed_work(hdev->workqueue, &hdev->le_scan_restart,
+ queue_delayed_work(hdev->req_workqueue, &hdev->le_scan_restart,
DISCOV_LE_RESTART_DELAY);
}
@@ -8527,6 +7280,18 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
return;
}
+ if (hdev->discovery.limited) {
+ /* Check for limited discoverable bit */
+ if (dev_class) {
+ if (!(dev_class[1] & 0x20))
+ return;
+ } else {
+ u8 *flags = eir_get_data(eir, eir_len, EIR_FLAGS, NULL);
+ if (!flags || !(flags[0] & LE_AD_LIMITED))
+ return;
+ }
+ }
+
/* Make sure that the buffer is big enough. The 5 extra bytes
* are for the potential CoD field.
*/
@@ -8556,7 +7321,8 @@ void mgmt_device_found(struct hci_dev *hdev, bdaddr_t *bdaddr, u8 link_type,
/* Copy EIR or advertising data into event */
memcpy(ev->eir, eir, eir_len);
- if (dev_class && !eir_has_data_type(ev->eir, eir_len, EIR_CLASS_OF_DEV))
+ if (dev_class && !eir_get_data(ev->eir, eir_len, EIR_CLASS_OF_DEV,
+ NULL))
eir_len = eir_append_data(ev->eir, eir_len, EIR_CLASS_OF_DEV,
dev_class, 3);
@@ -8606,35 +7372,6 @@ void mgmt_discovering(struct hci_dev *hdev, u8 discovering)
mgmt_event(MGMT_EV_DISCOVERING, hdev, &ev, sizeof(ev), NULL);
}
-static void adv_enable_complete(struct hci_dev *hdev, u8 status, u16 opcode)
-{
- BT_DBG("%s status %u", hdev->name, status);
-}
-
-void mgmt_reenable_advertising(struct hci_dev *hdev)
-{
- struct hci_request req;
- u8 instance;
-
- if (!hci_dev_test_flag(hdev, HCI_ADVERTISING) &&
- !hci_dev_test_flag(hdev, HCI_ADVERTISING_INSTANCE))
- return;
-
- instance = get_current_adv_instance(hdev);
-
- hci_req_init(&req, hdev);
-
- if (instance) {
- schedule_adv_instance(&req, instance, true);
- } else {
- update_adv_data(&req);
- update_scan_rsp_data(&req);
- enable_advertising(&req);
- }
-
- hci_req_run(&req, adv_enable_complete);
-}
-
static struct hci_mgmt_chan chan = {
.channel = HCI_CHANNEL_CONTROL,
.handler_count = ARRAY_SIZE(mgmt_handlers),
diff --git a/net/bluetooth/rfcomm/core.c b/net/bluetooth/rfcomm/core.c
index 29709fbfd1f5..f7eb02f09b54 100644
--- a/net/bluetooth/rfcomm/core.c
+++ b/net/bluetooth/rfcomm/core.c
@@ -692,11 +692,9 @@ static struct rfcomm_session *rfcomm_session_del(struct rfcomm_session *s)
static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst)
{
- struct rfcomm_session *s;
- struct list_head *p, *n;
+ struct rfcomm_session *s, *n;
struct l2cap_chan *chan;
- list_for_each_safe(p, n, &session_list) {
- s = list_entry(p, struct rfcomm_session, list);
+ list_for_each_entry_safe(s, n, &session_list, list) {
chan = l2cap_pi(s->sock->sk)->chan;
if ((!bacmp(src, BDADDR_ANY) || !bacmp(&chan->src, src)) &&
@@ -709,16 +707,14 @@ static struct rfcomm_session *rfcomm_session_get(bdaddr_t *src, bdaddr_t *dst)
static struct rfcomm_session *rfcomm_session_close(struct rfcomm_session *s,
int err)
{
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
+ struct rfcomm_dlc *d, *n;
s->state = BT_CLOSED;
BT_DBG("session %p state %ld err %d", s, s->state, err);
/* Close all dlcs */
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
+ list_for_each_entry_safe(d, n, &s->dlcs, list) {
d->state = BT_CLOSED;
__rfcomm_dlc_close(d, err);
}
@@ -1771,13 +1767,11 @@ static struct rfcomm_session *rfcomm_recv_frame(struct rfcomm_session *s,
static void rfcomm_process_connect(struct rfcomm_session *s)
{
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
+ struct rfcomm_dlc *d, *n;
BT_DBG("session %p state %ld", s, s->state);
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
+ list_for_each_entry_safe(d, n, &s->dlcs, list) {
if (d->state == BT_CONFIG) {
d->mtu = s->mtu;
if (rfcomm_check_security(d)) {
@@ -1843,14 +1837,11 @@ static int rfcomm_process_tx(struct rfcomm_dlc *d)
static void rfcomm_process_dlcs(struct rfcomm_session *s)
{
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
+ struct rfcomm_dlc *d, *n;
BT_DBG("session %p state %ld", s, s->state);
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
-
+ list_for_each_entry_safe(d, n, &s->dlcs, list) {
if (test_bit(RFCOMM_TIMED_OUT, &d->flags)) {
__rfcomm_dlc_close(d, ETIMEDOUT);
continue;
@@ -1985,14 +1976,11 @@ static struct rfcomm_session *rfcomm_check_connection(struct rfcomm_session *s)
static void rfcomm_process_sessions(void)
{
- struct list_head *p, *n;
+ struct rfcomm_session *s, *n;
rfcomm_lock();
- list_for_each_safe(p, n, &session_list) {
- struct rfcomm_session *s;
- s = list_entry(p, struct rfcomm_session, list);
-
+ list_for_each_entry_safe(s, n, &session_list, list) {
if (test_and_clear_bit(RFCOMM_TIMED_OUT, &s->flags)) {
s->state = BT_DISCONN;
rfcomm_send_disc(s, 0);
@@ -2075,15 +2063,12 @@ failed:
static void rfcomm_kill_listener(void)
{
- struct rfcomm_session *s;
- struct list_head *p, *n;
+ struct rfcomm_session *s, *n;
BT_DBG("");
- list_for_each_safe(p, n, &session_list) {
- s = list_entry(p, struct rfcomm_session, list);
+ list_for_each_entry_safe(s, n, &session_list, list)
rfcomm_session_del(s);
- }
}
static int rfcomm_run(void *unused)
@@ -2113,8 +2098,7 @@ static int rfcomm_run(void *unused)
static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
{
struct rfcomm_session *s;
- struct rfcomm_dlc *d;
- struct list_head *p, *n;
+ struct rfcomm_dlc *d, *n;
BT_DBG("conn %p status 0x%02x encrypt 0x%02x", conn, status, encrypt);
@@ -2122,9 +2106,7 @@ static void rfcomm_security_cfm(struct hci_conn *conn, u8 status, u8 encrypt)
if (!s)
return;
- list_for_each_safe(p, n, &s->dlcs) {
- d = list_entry(p, struct rfcomm_dlc, list);
-
+ list_for_each_entry_safe(d, n, &s->dlcs, list) {
if (test_and_clear_bit(RFCOMM_SEC_PENDING, &d->flags)) {
rfcomm_dlc_clear_timer(d);
if (status || encrypt == 0x00) {
diff --git a/net/bluetooth/sco.c b/net/bluetooth/sco.c
index fe129663bd3f..f52bcbf2e58c 100644
--- a/net/bluetooth/sco.c
+++ b/net/bluetooth/sco.c
@@ -526,6 +526,9 @@ static int sco_sock_bind(struct socket *sock, struct sockaddr *addr,
if (!addr || addr->sa_family != AF_BLUETOOTH)
return -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_sco))
+ return -EINVAL;
+
lock_sock(sk);
if (sk->sk_state != BT_OPEN) {
diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
index c91353841e40..50976a6481f3 100644
--- a/net/bluetooth/smp.c
+++ b/net/bluetooth/smp.c
@@ -21,9 +21,10 @@
*/
#include <linux/debugfs.h>
-#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <crypto/b128ops.h>
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <net/bluetooth/bluetooth.h>
#include <net/bluetooth/hci_core.h>
@@ -87,8 +88,8 @@ struct smp_dev {
u8 min_key_size;
u8 max_key_size;
- struct crypto_blkcipher *tfm_aes;
- struct crypto_hash *tfm_cmac;
+ struct crypto_skcipher *tfm_aes;
+ struct crypto_shash *tfm_cmac;
};
struct smp_chan {
@@ -126,8 +127,8 @@ struct smp_chan {
u8 dhkey[32];
u8 mackey[16];
- struct crypto_blkcipher *tfm_aes;
- struct crypto_hash *tfm_cmac;
+ struct crypto_skcipher *tfm_aes;
+ struct crypto_shash *tfm_cmac;
};
/* These debug key values are defined in the SMP section of the core
@@ -165,12 +166,11 @@ static inline void swap_buf(const u8 *src, u8 *dst, size_t len)
* AES-CMAC, f4, f5, f6, g2 and h6.
*/
-static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
+static int aes_cmac(struct crypto_shash *tfm, const u8 k[16], const u8 *m,
size_t len, u8 mac[16])
{
uint8_t tmp[16], mac_msb[16], msg_msb[CMAC_MSG_MAX];
- struct hash_desc desc;
- struct scatterlist sg;
+ SHASH_DESC_ON_STACK(desc, tfm);
int err;
if (len > CMAC_MSG_MAX)
@@ -181,10 +181,8 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
return -EINVAL;
}
- desc.tfm = tfm;
- desc.flags = 0;
-
- crypto_hash_init(&desc);
+ desc->tfm = tfm;
+ desc->flags = 0;
/* Swap key and message from LSB to MSB */
swap_buf(k, tmp, 16);
@@ -193,23 +191,16 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
SMP_DBG("msg (len %zu) %*phN", len, (int) len, m);
SMP_DBG("key %16phN", k);
- err = crypto_hash_setkey(tfm, tmp, 16);
+ err = crypto_shash_setkey(tfm, tmp, 16);
if (err) {
BT_ERR("cipher setkey failed: %d", err);
return err;
}
- sg_init_one(&sg, msg_msb, len);
-
- err = crypto_hash_update(&desc, &sg, len);
- if (err) {
- BT_ERR("Hash update error %d", err);
- return err;
- }
-
- err = crypto_hash_final(&desc, mac_msb);
+ err = crypto_shash_digest(desc, msg_msb, len, mac_msb);
+ shash_desc_zero(desc);
if (err) {
- BT_ERR("Hash final error %d", err);
+ BT_ERR("Hash computation error %d", err);
return err;
}
@@ -220,8 +211,8 @@ static int aes_cmac(struct crypto_hash *tfm, const u8 k[16], const u8 *m,
return 0;
}
-static int smp_f4(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
- const u8 x[16], u8 z, u8 res[16])
+static int smp_f4(struct crypto_shash *tfm_cmac, const u8 u[32],
+ const u8 v[32], const u8 x[16], u8 z, u8 res[16])
{
u8 m[65];
int err;
@@ -243,7 +234,7 @@ static int smp_f4(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
return err;
}
-static int smp_f5(struct crypto_hash *tfm_cmac, const u8 w[32],
+static int smp_f5(struct crypto_shash *tfm_cmac, const u8 w[32],
const u8 n1[16], const u8 n2[16], const u8 a1[7],
const u8 a2[7], u8 mackey[16], u8 ltk[16])
{
@@ -296,7 +287,7 @@ static int smp_f5(struct crypto_hash *tfm_cmac, const u8 w[32],
return 0;
}
-static int smp_f6(struct crypto_hash *tfm_cmac, const u8 w[16],
+static int smp_f6(struct crypto_shash *tfm_cmac, const u8 w[16],
const u8 n1[16], const u8 n2[16], const u8 r[16],
const u8 io_cap[3], const u8 a1[7], const u8 a2[7],
u8 res[16])
@@ -324,7 +315,7 @@ static int smp_f6(struct crypto_hash *tfm_cmac, const u8 w[16],
return err;
}
-static int smp_g2(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
+static int smp_g2(struct crypto_shash *tfm_cmac, const u8 u[32], const u8 v[32],
const u8 x[16], const u8 y[16], u32 *val)
{
u8 m[80], tmp[16];
@@ -350,7 +341,7 @@ static int smp_g2(struct crypto_hash *tfm_cmac, const u8 u[32], const u8 v[32],
return 0;
}
-static int smp_h6(struct crypto_hash *tfm_cmac, const u8 w[16],
+static int smp_h6(struct crypto_shash *tfm_cmac, const u8 w[16],
const u8 key_id[4], u8 res[16])
{
int err;
@@ -370,9 +361,9 @@ static int smp_h6(struct crypto_hash *tfm_cmac, const u8 w[16],
* s1 and ah.
*/
-static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
+static int smp_e(struct crypto_skcipher *tfm, const u8 *k, u8 *r)
{
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
struct scatterlist sg;
uint8_t tmp[16], data[16];
int err;
@@ -384,13 +375,10 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
return -EINVAL;
}
- desc.tfm = tfm;
- desc.flags = 0;
-
/* The most significant octet of key corresponds to k[0] */
swap_buf(k, tmp, 16);
- err = crypto_blkcipher_setkey(tfm, tmp, 16);
+ err = crypto_skcipher_setkey(tfm, tmp, 16);
if (err) {
BT_ERR("cipher setkey failed: %d", err);
return err;
@@ -401,7 +389,12 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
sg_init_one(&sg, data, 16);
- err = crypto_blkcipher_encrypt(&desc, &sg, &sg, 16);
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg, &sg, 16, NULL);
+
+ err = crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
if (err)
BT_ERR("Encrypt data error %d", err);
@@ -413,7 +406,7 @@ static int smp_e(struct crypto_blkcipher *tfm, const u8 *k, u8 *r)
return err;
}
-static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
+static int smp_c1(struct crypto_skcipher *tfm_aes, const u8 k[16],
const u8 r[16], const u8 preq[7], const u8 pres[7], u8 _iat,
const bdaddr_t *ia, u8 _rat, const bdaddr_t *ra, u8 res[16])
{
@@ -462,7 +455,7 @@ static int smp_c1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
return err;
}
-static int smp_s1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
+static int smp_s1(struct crypto_skcipher *tfm_aes, const u8 k[16],
const u8 r1[16], const u8 r2[16], u8 _r[16])
{
int err;
@@ -478,7 +471,7 @@ static int smp_s1(struct crypto_blkcipher *tfm_aes, const u8 k[16],
return err;
}
-static int smp_ah(struct crypto_blkcipher *tfm, const u8 irk[16],
+static int smp_ah(struct crypto_skcipher *tfm, const u8 irk[16],
const u8 r[3], u8 res[3])
{
u8 _res[16];
@@ -766,8 +759,8 @@ static void smp_chan_destroy(struct l2cap_conn *conn)
kzfree(smp->slave_csrk);
kzfree(smp->link_key);
- crypto_free_blkcipher(smp->tfm_aes);
- crypto_free_hash(smp->tfm_cmac);
+ crypto_free_skcipher(smp->tfm_aes);
+ crypto_free_shash(smp->tfm_cmac);
/* Ensure that we don't leave any debug key around if debug key
* support hasn't been explicitly enabled.
@@ -1072,22 +1065,6 @@ static void smp_notify_keys(struct l2cap_conn *conn)
hcon->dst_type = smp->remote_irk->addr_type;
queue_work(hdev->workqueue, &conn->id_addr_update_work);
}
-
- /* When receiving an indentity resolving key for
- * a remote device that does not use a resolvable
- * private address, just remove the key so that
- * it is possible to use the controller white
- * list for scanning.
- *
- * Userspace will have been told to not store
- * this key at this point. So it is safe to
- * just remove it.
- */
- if (!bacmp(&smp->remote_irk->rpa, BDADDR_ANY)) {
- list_del_rcu(&smp->remote_irk->list);
- kfree_rcu(smp->remote_irk, rcu);
- smp->remote_irk = NULL;
- }
}
if (smp->csrk) {
@@ -1382,17 +1359,17 @@ static struct smp_chan *smp_chan_create(struct l2cap_conn *conn)
if (!smp)
return NULL;
- smp->tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ smp->tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(smp->tfm_aes)) {
BT_ERR("Unable to create ECB crypto context");
kzfree(smp);
return NULL;
}
- smp->tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+ smp->tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(smp->tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_blkcipher(smp->tfm_aes);
+ crypto_free_skcipher(smp->tfm_aes);
kzfree(smp);
return NULL;
}
@@ -3027,8 +3004,13 @@ static void smp_ready_cb(struct l2cap_chan *chan)
BT_DBG("chan %p", chan);
+ /* No need to call l2cap_chan_hold() here since we already own
+ * the reference taken in smp_new_conn_cb(). This is just the
+ * first time that we tie it to a specific pointer. The code in
+ * l2cap_core.c ensures that there's no risk this function wont
+ * get called if smp_new_conn_cb was previously called.
+ */
conn->smp = chan;
- l2cap_chan_hold(chan);
if (hcon->type == ACL_LINK && test_bit(HCI_CONN_ENCRYPT, &hcon->flags))
bredr_pairing(chan);
@@ -3138,8 +3120,8 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
{
struct l2cap_chan *chan;
struct smp_dev *smp;
- struct crypto_blkcipher *tfm_aes;
- struct crypto_hash *tfm_cmac;
+ struct crypto_skcipher *tfm_aes;
+ struct crypto_shash *tfm_cmac;
if (cid == L2CAP_CID_SMP_BREDR) {
smp = NULL;
@@ -3150,17 +3132,17 @@ static struct l2cap_chan *smp_add_cid(struct hci_dev *hdev, u16 cid)
if (!smp)
return ERR_PTR(-ENOMEM);
- tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_aes)) {
BT_ERR("Unable to create ECB crypto context");
kzfree(smp);
return ERR_CAST(tfm_aes);
}
- tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+ tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, 0);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_blkcipher(tfm_aes);
+ crypto_free_skcipher(tfm_aes);
kzfree(smp);
return ERR_CAST(tfm_cmac);
}
@@ -3174,8 +3156,8 @@ create_chan:
chan = l2cap_chan_create();
if (!chan) {
if (smp) {
- crypto_free_blkcipher(smp->tfm_aes);
- crypto_free_hash(smp->tfm_cmac);
+ crypto_free_skcipher(smp->tfm_aes);
+ crypto_free_shash(smp->tfm_cmac);
kzfree(smp);
}
return ERR_PTR(-ENOMEM);
@@ -3221,10 +3203,8 @@ static void smp_del_chan(struct l2cap_chan *chan)
smp = chan->data;
if (smp) {
chan->data = NULL;
- if (smp->tfm_aes)
- crypto_free_blkcipher(smp->tfm_aes);
- if (smp->tfm_cmac)
- crypto_free_hash(smp->tfm_cmac);
+ crypto_free_skcipher(smp->tfm_aes);
+ crypto_free_shash(smp->tfm_cmac);
kzfree(smp);
}
@@ -3460,7 +3440,7 @@ void smp_unregister(struct hci_dev *hdev)
#if IS_ENABLED(CONFIG_BT_SELFTEST_SMP)
-static int __init test_ah(struct crypto_blkcipher *tfm_aes)
+static int __init test_ah(struct crypto_skcipher *tfm_aes)
{
const u8 irk[16] = {
0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
@@ -3480,7 +3460,7 @@ static int __init test_ah(struct crypto_blkcipher *tfm_aes)
return 0;
}
-static int __init test_c1(struct crypto_blkcipher *tfm_aes)
+static int __init test_c1(struct crypto_skcipher *tfm_aes)
{
const u8 k[16] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3510,7 +3490,7 @@ static int __init test_c1(struct crypto_blkcipher *tfm_aes)
return 0;
}
-static int __init test_s1(struct crypto_blkcipher *tfm_aes)
+static int __init test_s1(struct crypto_skcipher *tfm_aes)
{
const u8 k[16] = {
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
@@ -3535,7 +3515,7 @@ static int __init test_s1(struct crypto_blkcipher *tfm_aes)
return 0;
}
-static int __init test_f4(struct crypto_hash *tfm_cmac)
+static int __init test_f4(struct crypto_shash *tfm_cmac)
{
const u8 u[32] = {
0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
@@ -3567,7 +3547,7 @@ static int __init test_f4(struct crypto_hash *tfm_cmac)
return 0;
}
-static int __init test_f5(struct crypto_hash *tfm_cmac)
+static int __init test_f5(struct crypto_shash *tfm_cmac)
{
const u8 w[32] = {
0x98, 0xa6, 0xbf, 0x73, 0xf3, 0x34, 0x8d, 0x86,
@@ -3604,7 +3584,7 @@ static int __init test_f5(struct crypto_hash *tfm_cmac)
return 0;
}
-static int __init test_f6(struct crypto_hash *tfm_cmac)
+static int __init test_f6(struct crypto_shash *tfm_cmac)
{
const u8 w[16] = {
0x20, 0x6e, 0x63, 0xce, 0x20, 0x6a, 0x3f, 0xfd,
@@ -3637,7 +3617,7 @@ static int __init test_f6(struct crypto_hash *tfm_cmac)
return 0;
}
-static int __init test_g2(struct crypto_hash *tfm_cmac)
+static int __init test_g2(struct crypto_shash *tfm_cmac)
{
const u8 u[32] = {
0xe6, 0x9d, 0x35, 0x0e, 0x48, 0x01, 0x03, 0xcc,
@@ -3669,7 +3649,7 @@ static int __init test_g2(struct crypto_hash *tfm_cmac)
return 0;
}
-static int __init test_h6(struct crypto_hash *tfm_cmac)
+static int __init test_h6(struct crypto_shash *tfm_cmac)
{
const u8 w[16] = {
0x9b, 0x7d, 0x39, 0x0a, 0xa6, 0x10, 0x10, 0x34,
@@ -3706,8 +3686,8 @@ static const struct file_operations test_smp_fops = {
.llseek = default_llseek,
};
-static int __init run_selftests(struct crypto_blkcipher *tfm_aes,
- struct crypto_hash *tfm_cmac)
+static int __init run_selftests(struct crypto_skcipher *tfm_aes,
+ struct crypto_shash *tfm_cmac)
{
ktime_t calltime, delta, rettime;
unsigned long long duration;
@@ -3784,27 +3764,27 @@ done:
int __init bt_selftest_smp(void)
{
- struct crypto_blkcipher *tfm_aes;
- struct crypto_hash *tfm_cmac;
+ struct crypto_skcipher *tfm_aes;
+ struct crypto_shash *tfm_cmac;
int err;
- tfm_aes = crypto_alloc_blkcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
+ tfm_aes = crypto_alloc_skcipher("ecb(aes)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_aes)) {
BT_ERR("Unable to create ECB crypto context");
return PTR_ERR(tfm_aes);
}
- tfm_cmac = crypto_alloc_hash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
+ tfm_cmac = crypto_alloc_shash("cmac(aes)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(tfm_cmac)) {
BT_ERR("Unable to create CMAC crypto context");
- crypto_free_blkcipher(tfm_aes);
+ crypto_free_skcipher(tfm_aes);
return PTR_ERR(tfm_cmac);
}
err = run_selftests(tfm_aes, tfm_cmac);
- crypto_free_hash(tfm_cmac);
- crypto_free_blkcipher(tfm_aes);
+ crypto_free_shash(tfm_cmac);
+ crypto_free_skcipher(tfm_aes);
return err;
}
diff --git a/net/bridge/br.c b/net/bridge/br.c
index a1abe4936fe1..3addc05b9a16 100644
--- a/net/bridge/br.c
+++ b/net/bridge/br.c
@@ -121,6 +121,7 @@ static struct notifier_block br_device_notifier = {
.notifier_call = br_device_event
};
+/* called with RTNL */
static int br_switchdev_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
@@ -130,7 +131,6 @@ static int br_switchdev_event(struct notifier_block *unused,
struct switchdev_notifier_fdb_info *fdb_info;
int err = NOTIFY_DONE;
- rtnl_lock();
p = br_port_get_rtnl(dev);
if (!p)
goto out;
@@ -155,7 +155,6 @@ static int br_switchdev_event(struct notifier_block *unused,
}
out:
- rtnl_unlock();
return err;
}
diff --git a/net/bridge/br_device.c b/net/bridge/br_device.c
index 5e88d3e17546..2c8095a5d824 100644
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -28,6 +28,8 @@
const struct nf_br_ops __rcu *nf_br_ops __read_mostly;
EXPORT_SYMBOL_GPL(nf_br_ops);
+static struct lock_class_key bridge_netdev_addr_lock_key;
+
/* net device transmit always called with BH disabled */
netdev_tx_t br_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
@@ -87,6 +89,11 @@ out:
return NETDEV_TX_OK;
}
+static void br_set_lockdep_class(struct net_device *dev)
+{
+ lockdep_set_class(&dev->addr_list_lock, &bridge_netdev_addr_lock_key);
+}
+
static int br_dev_init(struct net_device *dev)
{
struct net_bridge *br = netdev_priv(dev);
@@ -99,6 +106,7 @@ static int br_dev_init(struct net_device *dev)
err = br_vlan_init(br);
if (err)
free_percpu(br->stats);
+ br_set_lockdep_class(dev);
return err;
}
diff --git a/net/bridge/br_fdb.c b/net/bridge/br_fdb.c
index a642bb829d09..dcea4f4c62b3 100644
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -135,6 +135,7 @@ static void fdb_del_external_learn(struct net_bridge_fdb_entry *f)
{
struct switchdev_obj_port_fdb fdb = {
.obj = {
+ .orig_dev = f->dst->dev,
.id = SWITCHDEV_OBJ_ID_PORT_FDB,
.flags = SWITCHDEV_F_DEFER,
},
@@ -722,6 +723,8 @@ int br_fdb_dump(struct sk_buff *skb,
struct net_bridge_fdb_entry *f;
hlist_for_each_entry_rcu(f, &br->hash[i], hlist) {
+ int err;
+
if (idx < cb->args[0])
goto skip;
@@ -740,12 +743,15 @@ int br_fdb_dump(struct sk_buff *skb,
if (!filter_dev && f->dst)
goto skip;
- if (fdb_fill_info(skb, br, f,
- NETLINK_CB(cb->skb).portid,
- cb->nlh->nlmsg_seq,
- RTM_NEWNEIGH,
- NLM_F_MULTI) < 0)
+ err = fdb_fill_info(skb, br, f,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ RTM_NEWNEIGH,
+ NLM_F_MULTI);
+ if (err < 0) {
+ cb->args[1] = err;
break;
+ }
skip:
++idx;
}
diff --git a/net/bridge/br_forward.c b/net/bridge/br_forward.c
index fcdb86dd5a23..f47759f05b6d 100644
--- a/net/bridge/br_forward.c
+++ b/net/bridge/br_forward.c
@@ -44,7 +44,6 @@ int br_dev_queue_push_xmit(struct net *net, struct sock *sk, struct sk_buff *skb
skb_push(skb, ETH_HLEN);
br_drop_fake_rtable(skb);
- skb_sender_cpu_clear(skb);
if (skb->ip_summed == CHECKSUM_PARTIAL &&
(skb->protocol == htons(ETH_P_8021Q) ||
diff --git a/net/bridge/br_if.c b/net/bridge/br_if.c
index ec02f5869a78..8217aecf025b 100644
--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -36,10 +36,10 @@
*/
static int port_cost(struct net_device *dev)
{
- struct ethtool_cmd ecmd;
+ struct ethtool_link_ksettings ecmd;
- if (!__ethtool_get_settings(dev, &ecmd)) {
- switch (ethtool_cmd_speed(&ecmd)) {
+ if (!__ethtool_get_link_ksettings(dev, &ecmd)) {
+ switch (ecmd.base.speed) {
case SPEED_10000:
return 2;
case SPEED_1000:
@@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head)
destroy_nbp(p);
}
+static unsigned get_max_headroom(struct net_bridge *br)
+{
+ unsigned max_headroom = 0;
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ unsigned dev_headroom = netdev_get_fwd_headroom(p->dev);
+
+ if (dev_headroom > max_headroom)
+ max_headroom = dev_headroom;
+ }
+
+ return max_headroom;
+}
+
+static void update_headroom(struct net_bridge *br, int new_hr)
+{
+ struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list)
+ netdev_set_rx_headroom(p->dev, new_hr);
+
+ br->dev->needed_headroom = new_hr;
+}
+
/* Delete port(interface) from bridge is done in two steps.
* via RCU. First step, marks device as down. That deletes
* all the timers and stops new packets from flowing through.
@@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p)
br_ifinfo_notify(RTM_DELLINK, p);
list_del_rcu(&p->list);
+ if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom)
+ update_headroom(br, get_max_headroom(br));
+ netdev_reset_rx_headroom(dev);
nbp_vlan_flush(p);
br_fdb_delete_by_port(br, p, 0, 1);
@@ -409,6 +437,20 @@ int br_min_mtu(const struct net_bridge *br)
return mtu;
}
+static void br_set_gso_limits(struct net_bridge *br)
+{
+ unsigned int gso_max_size = GSO_MAX_SIZE;
+ u16 gso_max_segs = GSO_MAX_SEGS;
+ const struct net_bridge_port *p;
+
+ list_for_each_entry(p, &br->port_list, list) {
+ gso_max_size = min(gso_max_size, p->dev->gso_max_size);
+ gso_max_segs = min(gso_max_segs, p->dev->gso_max_segs);
+ }
+ br->dev->gso_max_size = gso_max_size;
+ br->dev->gso_max_segs = gso_max_segs;
+}
+
/*
* Recomputes features using slave's features
*/
@@ -438,6 +480,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
int err = 0;
+ unsigned br_hr, dev_hr;
bool changed_addr;
/* Don't allow bridging non-ethernet like devices, or DSA-enabled
@@ -493,7 +536,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
dev->priv_flags |= IFF_BRIDGE_PORT;
- err = netdev_master_upper_dev_link(dev, br->dev);
+ err = netdev_master_upper_dev_link(dev, br->dev, NULL, NULL);
if (err)
goto err5;
@@ -505,14 +548,21 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
netdev_update_features(br->dev);
- if (br->dev->needed_headroom < dev->needed_headroom)
- br->dev->needed_headroom = dev->needed_headroom;
+ br_hr = br->dev->needed_headroom;
+ dev_hr = netdev_get_fwd_headroom(dev);
+ if (br_hr < dev_hr)
+ update_headroom(br, dev_hr);
+ else
+ netdev_set_rx_headroom(dev, br_hr);
if (br_fdb_insert(br, p, dev->dev_addr, 0))
netdev_err(dev, "failed insert local address bridge forwarding table\n");
- if (nbp_vlan_init(p))
+ err = nbp_vlan_init(p);
+ if (err) {
netdev_err(dev, "failed to initialize vlan filtering on this port\n");
+ goto err6;
+ }
spin_lock_bh(&br->lock);
changed_addr = br_stp_recalculate_bridge_id(br);
@@ -528,11 +578,18 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
call_netdevice_notifiers(NETDEV_CHANGEADDR, br->dev);
dev_set_mtu(br->dev, br_min_mtu(br));
+ br_set_gso_limits(br);
kobject_uevent(&p->kobj, KOBJ_ADD);
return 0;
+err6:
+ list_del_rcu(&p->list);
+ br_fdb_delete_by_port(br, p, 0, 1);
+ nbp_update_port_count(br);
+ netdev_upper_dev_unlink(dev, br->dev);
+
err5:
dev->priv_flags &= ~IFF_BRIDGE_PORT;
netdev_rx_handler_unregister(dev);
@@ -568,6 +625,7 @@ int br_del_if(struct net_bridge *br, struct net_device *dev)
del_nbp(p);
dev_set_mtu(br->dev, br_min_mtu(br));
+ br_set_gso_limits(br);
spin_lock_bh(&br->lock);
changed_addr = br_stp_recalculate_bridge_id(br);
diff --git a/net/bridge/br_input.c b/net/bridge/br_input.c
index f7fba74108a9..160797722228 100644
--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -222,7 +222,10 @@ static int br_handle_local_finish(struct net *net, struct sock *sk, struct sk_bu
/* check if vlan is allowed, to avoid spoofing */
if (p->flags & BR_LEARNING && br_should_learn(p, skb, &vid))
br_fdb_update(p->br, p, eth_hdr(skb)->h_source, vid, false);
- return 0; /* process further */
+
+ BR_INPUT_SKB_CB(skb)->brdev = p->br->dev;
+ br_pass_frame_up(skb);
+ return 0;
}
/*
@@ -284,14 +287,9 @@ rx_handler_result_t br_handle_frame(struct sk_buff **pskb)
}
/* Deliver packet to local host only */
- if (NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN,
- dev_net(skb->dev), NULL, skb, skb->dev, NULL,
- br_handle_local_finish)) {
- return RX_HANDLER_CONSUMED; /* consumed by filter */
- } else {
- *pskb = skb;
- return RX_HANDLER_PASS; /* continue processing */
- }
+ NF_HOOK(NFPROTO_BRIDGE, NF_BR_LOCAL_IN, dev_net(skb->dev),
+ NULL, skb, skb->dev, NULL, br_handle_local_finish);
+ return RX_HANDLER_CONSUMED;
}
forward:
diff --git a/net/bridge/br_mdb.c b/net/bridge/br_mdb.c
index cd8deea2d074..253bc77eda3b 100644
--- a/net/bridge/br_mdb.c
+++ b/net/bridge/br_mdb.c
@@ -7,6 +7,7 @@
#include <linux/if_ether.h>
#include <net/ip.h>
#include <net/netlink.h>
+#include <net/switchdev.h>
#if IS_ENABLED(CONFIG_IPV6)
#include <net/ipv6.h>
#include <net/addrconf.h>
@@ -19,7 +20,7 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
{
struct net_bridge *br = netdev_priv(dev);
struct net_bridge_port *p;
- struct nlattr *nest;
+ struct nlattr *nest, *port_nest;
if (!br->multicast_router || hlist_empty(&br->router_list))
return 0;
@@ -29,8 +30,20 @@ static int br_rports_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
return -EMSGSIZE;
hlist_for_each_entry_rcu(p, &br->router_list, rlist) {
- if (p && nla_put_u32(skb, MDBA_ROUTER_PORT, p->dev->ifindex))
+ if (!p)
+ continue;
+ port_nest = nla_nest_start(skb, MDBA_ROUTER_PORT);
+ if (!port_nest)
goto fail;
+ if (nla_put_nohdr(skb, sizeof(u32), &p->dev->ifindex) ||
+ nla_put_u32(skb, MDBA_ROUTER_PATTR_TIMER,
+ br_timer_value(&p->multicast_router_timer)) ||
+ nla_put_u8(skb, MDBA_ROUTER_PATTR_TYPE,
+ p->multicast_router)) {
+ nla_nest_cancel(skb, port_nest);
+ goto fail;
+ }
+ nla_nest_end(skb, port_nest);
}
nla_nest_end(skb, nest);
@@ -40,6 +53,14 @@ fail:
return -EMSGSIZE;
}
+static void __mdb_entry_fill_flags(struct br_mdb_entry *e, unsigned char flags)
+{
+ e->state = flags & MDB_PG_FLAGS_PERMANENT;
+ e->flags = 0;
+ if (flags & MDB_PG_FLAGS_OFFLOAD)
+ e->flags |= MDB_FLAGS_OFFLOAD;
+}
+
static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev)
{
@@ -79,26 +100,41 @@ static int br_mdb_fill_info(struct sk_buff *skb, struct netlink_callback *cb,
for (pp = &mp->ports;
(p = rcu_dereference(*pp)) != NULL;
pp = &p->next) {
+ struct nlattr *nest_ent;
+ struct br_mdb_entry e;
+
port = p->port;
- if (port) {
- struct br_mdb_entry e;
- memset(&e, 0, sizeof(e));
- e.ifindex = port->dev->ifindex;
- e.state = p->state;
- e.vid = p->addr.vid;
- if (p->addr.proto == htons(ETH_P_IP))
- e.addr.u.ip4 = p->addr.u.ip4;
+ if (!port)
+ continue;
+
+ memset(&e, 0, sizeof(e));
+ e.ifindex = port->dev->ifindex;
+ e.vid = p->addr.vid;
+ __mdb_entry_fill_flags(&e, p->flags);
+ if (p->addr.proto == htons(ETH_P_IP))
+ e.addr.u.ip4 = p->addr.u.ip4;
#if IS_ENABLED(CONFIG_IPV6)
- if (p->addr.proto == htons(ETH_P_IPV6))
- e.addr.u.ip6 = p->addr.u.ip6;
+ if (p->addr.proto == htons(ETH_P_IPV6))
+ e.addr.u.ip6 = p->addr.u.ip6;
#endif
- e.addr.proto = p->addr.proto;
- if (nla_put(skb, MDBA_MDB_ENTRY_INFO, sizeof(e), &e)) {
- nla_nest_cancel(skb, nest2);
- err = -EMSGSIZE;
- goto out;
- }
+ e.addr.proto = p->addr.proto;
+ nest_ent = nla_nest_start(skb,
+ MDBA_MDB_ENTRY_INFO);
+ if (!nest_ent) {
+ nla_nest_cancel(skb, nest2);
+ err = -EMSGSIZE;
+ goto out;
}
+ if (nla_put_nohdr(skb, sizeof(e), &e) ||
+ nla_put_u32(skb,
+ MDBA_MDB_EATTR_TIMER,
+ br_timer_value(&p->timer))) {
+ nla_nest_cancel(skb, nest_ent);
+ nla_nest_cancel(skb, nest2);
+ err = -EMSGSIZE;
+ goto out;
+ }
+ nla_nest_end(skb, nest_ent);
}
nla_nest_end(skb, nest2);
skip:
@@ -208,12 +244,37 @@ static inline size_t rtnl_mdb_nlmsg_size(void)
}
static void __br_mdb_notify(struct net_device *dev, struct br_mdb_entry *entry,
- int type)
+ int type, struct net_bridge_port_group *pg)
{
+ struct switchdev_obj_port_mdb mdb = {
+ .obj = {
+ .id = SWITCHDEV_OBJ_ID_PORT_MDB,
+ .flags = SWITCHDEV_F_DEFER,
+ },
+ .vid = entry->vid,
+ };
+ struct net_device *port_dev;
struct net *net = dev_net(dev);
struct sk_buff *skb;
int err = -ENOBUFS;
+ port_dev = __dev_get_by_index(net, entry->ifindex);
+ if (entry->addr.proto == htons(ETH_P_IP))
+ ip_eth_mc_map(entry->addr.u.ip4, mdb.addr);
+#if IS_ENABLED(CONFIG_IPV6)
+ else
+ ipv6_eth_mc_map(&entry->addr.u.ip6, mdb.addr);
+#endif
+
+ mdb.obj.orig_dev = port_dev;
+ if (port_dev && type == RTM_NEWMDB) {
+ err = switchdev_port_obj_add(port_dev, &mdb.obj);
+ if (!err && pg)
+ pg->flags |= MDB_PG_FLAGS_OFFLOAD;
+ } else if (port_dev && type == RTM_DELMDB) {
+ switchdev_port_obj_del(port_dev, &mdb.obj);
+ }
+
skb = nlmsg_new(rtnl_mdb_nlmsg_size(), GFP_ATOMIC);
if (!skb)
goto errout;
@@ -230,21 +291,21 @@ errout:
rtnl_set_sk_err(net, RTNLGRP_MDB, err);
}
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type, u8 state)
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg,
+ int type)
{
struct br_mdb_entry entry;
memset(&entry, 0, sizeof(entry));
- entry.ifindex = port->dev->ifindex;
- entry.addr.proto = group->proto;
- entry.addr.u.ip4 = group->u.ip4;
+ entry.ifindex = pg->port->dev->ifindex;
+ entry.addr.proto = pg->addr.proto;
+ entry.addr.u.ip4 = pg->addr.u.ip4;
#if IS_ENABLED(CONFIG_IPV6)
- entry.addr.u.ip6 = group->u.ip6;
+ entry.addr.u.ip6 = pg->addr.u.ip6;
#endif
- entry.state = state;
- entry.vid = group->vid;
- __br_mdb_notify(dev, &entry, type);
+ entry.vid = pg->addr.vid;
+ __mdb_entry_fill_flags(&entry, pg->flags);
+ __br_mdb_notify(dev, &entry, type, pg);
}
static int nlmsg_populate_rtr_fill(struct sk_buff *skb,
@@ -389,7 +450,8 @@ static int br_mdb_parse(struct sk_buff *skb, struct nlmsghdr *nlh,
}
static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
- struct br_ip *group, unsigned char state)
+ struct br_ip *group, unsigned char state,
+ struct net_bridge_port_group **pg)
{
struct net_bridge_mdb_entry *mp;
struct net_bridge_port_group *p;
@@ -402,8 +464,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
mp = br_mdb_ip_get(mdb, group);
if (!mp) {
mp = br_multicast_new_group(br, port, group);
- err = PTR_ERR(mp);
- if (IS_ERR(mp))
+ err = PTR_ERR_OR_ZERO(mp);
+ if (err)
return err;
}
@@ -420,6 +482,7 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
if (unlikely(!p))
return -ENOMEM;
rcu_assign_pointer(*pp, p);
+ *pg = p;
if (state == MDB_TEMPORARY)
mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -427,7 +490,8 @@ static int br_mdb_add_group(struct net_bridge *br, struct net_bridge_port *port,
}
static int __br_mdb_add(struct net *net, struct net_bridge *br,
- struct br_mdb_entry *entry)
+ struct br_mdb_entry *entry,
+ struct net_bridge_port_group **pg)
{
struct br_ip ip;
struct net_device *dev;
@@ -456,7 +520,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
#endif
spin_lock_bh(&br->multicast_lock);
- ret = br_mdb_add_group(br, p, &ip, entry->state);
+ ret = br_mdb_add_group(br, p, &ip, entry->state, pg);
spin_unlock_bh(&br->multicast_lock);
return ret;
}
@@ -464,6 +528,7 @@ static int __br_mdb_add(struct net *net, struct net_bridge *br,
static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
+ struct net_bridge_port_group *pg;
struct net_bridge_vlan_group *vg;
struct net_device *dev, *pdev;
struct br_mdb_entry *entry;
@@ -493,15 +558,15 @@ static int br_mdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
if (br_vlan_enabled(br) && vg && entry->vid == 0) {
list_for_each_entry(v, &vg->vlan_list, vlist) {
entry->vid = v->vid;
- err = __br_mdb_add(net, br, entry);
+ err = __br_mdb_add(net, br, entry, &pg);
if (err)
break;
- __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ __br_mdb_notify(dev, entry, RTM_NEWMDB, pg);
}
} else {
- err = __br_mdb_add(net, br, entry);
+ err = __br_mdb_add(net, br, entry, &pg);
if (!err)
- __br_mdb_notify(dev, entry, RTM_NEWMDB);
+ __br_mdb_notify(dev, entry, RTM_NEWMDB, pg);
}
return err;
@@ -545,7 +610,7 @@ static int __br_mdb_del(struct net_bridge *br, struct br_mdb_entry *entry)
if (p->port->state == BR_STATE_DISABLED)
goto unlock;
- entry->state = p->state;
+ __mdb_entry_fill_flags(entry, p->flags);
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
@@ -597,12 +662,12 @@ static int br_mdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
entry->vid = v->vid;
err = __br_mdb_del(br, entry);
if (!err)
- __br_mdb_notify(dev, entry, RTM_DELMDB);
+ __br_mdb_notify(dev, entry, RTM_DELMDB, NULL);
}
} else {
err = __br_mdb_del(br, entry);
if (!err)
- __br_mdb_notify(dev, entry, RTM_DELMDB);
+ __br_mdb_notify(dev, entry, RTM_DELMDB, NULL);
}
return err;
diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 03661d97463c..a4c15df2b792 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -283,8 +283,7 @@ static void br_multicast_del_pg(struct net_bridge *br,
rcu_assign_pointer(*pp, p->next);
hlist_del_init(&p->mglist);
del_timer(&p->timer);
- br_mdb_notify(br->dev, p->port, &pg->addr, RTM_DELMDB,
- p->state);
+ br_mdb_notify(br->dev, p, RTM_DELMDB);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
if (!mp->ports && !mp->mglist &&
@@ -304,7 +303,7 @@ static void br_multicast_port_group_expired(unsigned long data)
spin_lock(&br->multicast_lock);
if (!netif_running(br->dev) || timer_pending(&pg->timer) ||
- hlist_unhashed(&pg->mglist) || pg->state & MDB_PERMANENT)
+ hlist_unhashed(&pg->mglist) || pg->flags & MDB_PG_FLAGS_PERMANENT)
goto out;
br_multicast_del_pg(br, pg);
@@ -649,7 +648,7 @@ struct net_bridge_port_group *br_multicast_new_port_group(
struct net_bridge_port *port,
struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char state)
+ unsigned char flags)
{
struct net_bridge_port_group *p;
@@ -659,7 +658,7 @@ struct net_bridge_port_group *br_multicast_new_port_group(
p->addr = *group;
p->port = port;
- p->state = state;
+ p->flags = flags;
rcu_assign_pointer(p->next, next);
hlist_add_head(&p->mglist, &port->mglist);
setup_timer(&p->timer, br_multicast_port_group_expired,
@@ -702,11 +701,11 @@ static int br_multicast_add_group(struct net_bridge *br,
break;
}
- p = br_multicast_new_port_group(port, group, *pp, MDB_TEMPORARY);
+ p = br_multicast_new_port_group(port, group, *pp, 0);
if (unlikely(!p))
goto err;
rcu_assign_pointer(*pp, p);
- br_mdb_notify(br->dev, port, group, RTM_NEWMDB, MDB_TEMPORARY);
+ br_mdb_notify(br->dev, p, RTM_NEWMDB);
found:
mod_timer(&p->timer, now + br->multicast_membership_interval);
@@ -760,13 +759,17 @@ static void br_multicast_router_expired(unsigned long data)
struct net_bridge *br = port->br;
spin_lock(&br->multicast_lock);
- if (port->multicast_router != 1 ||
+ if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+ port->multicast_router == MDB_RTR_TYPE_PERM ||
timer_pending(&port->multicast_router_timer) ||
hlist_unhashed(&port->rlist))
goto out;
hlist_del_init_rcu(&port->rlist);
br_rtr_notify(br->dev, port, RTM_DELMDB);
+ /* Don't allow timer refresh if the router expired */
+ if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+ port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
out:
spin_unlock(&br->multicast_lock);
@@ -913,7 +916,7 @@ static void br_ip6_multicast_port_query_expired(unsigned long data)
void br_multicast_add_port(struct net_bridge_port *port)
{
- port->multicast_router = 1;
+ port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
setup_timer(&port->multicast_router_timer, br_multicast_router_expired,
(unsigned long)port);
@@ -960,7 +963,8 @@ void br_multicast_enable_port(struct net_bridge_port *port)
#if IS_ENABLED(CONFIG_IPV6)
br_multicast_enable(&port->ip6_own_query);
#endif
- if (port->multicast_router == 2 && hlist_unhashed(&port->rlist))
+ if (port->multicast_router == MDB_RTR_TYPE_PERM &&
+ hlist_unhashed(&port->rlist))
br_multicast_add_router(br, port);
out:
@@ -975,12 +979,15 @@ void br_multicast_disable_port(struct net_bridge_port *port)
spin_lock(&br->multicast_lock);
hlist_for_each_entry_safe(pg, n, &port->mglist, mglist)
- if (pg->state == MDB_TEMPORARY)
+ if (!(pg->flags & MDB_PG_FLAGS_PERMANENT))
br_multicast_del_pg(br, pg);
if (!hlist_unhashed(&port->rlist)) {
hlist_del_init_rcu(&port->rlist);
br_rtr_notify(br->dev, port, RTM_DELMDB);
+ /* Don't allow timer refresh if disabling */
+ if (port->multicast_router == MDB_RTR_TYPE_TEMP)
+ port->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
}
del_timer(&port->multicast_router_timer);
del_timer(&port->ip4_own_query.timer);
@@ -1228,13 +1235,14 @@ static void br_multicast_mark_router(struct net_bridge *br,
unsigned long now = jiffies;
if (!port) {
- if (br->multicast_router == 1)
+ if (br->multicast_router == MDB_RTR_TYPE_TEMP_QUERY)
mod_timer(&br->multicast_router_timer,
now + br->multicast_querier_interval);
return;
}
- if (port->multicast_router != 1)
+ if (port->multicast_router == MDB_RTR_TYPE_DISABLED ||
+ port->multicast_router == MDB_RTR_TYPE_PERM)
return;
br_multicast_add_router(br, port);
@@ -1453,8 +1461,7 @@ br_multicast_leave_group(struct net_bridge *br,
hlist_del_init(&p->mglist);
del_timer(&p->timer);
call_rcu_bh(&p->rcu, br_multicast_free_pg);
- br_mdb_notify(br->dev, port, group, RTM_DELMDB,
- p->state);
+ br_mdb_notify(br->dev, p, RTM_DELMDB);
if (!mp->ports && !mp->mglist &&
netif_running(br->dev))
@@ -1715,7 +1722,7 @@ void br_multicast_init(struct net_bridge *br)
br->hash_elasticity = 4;
br->hash_max = 512;
- br->multicast_router = 1;
+ br->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
br->multicast_querier = 0;
br->multicast_query_use_ifaddr = 0;
br->multicast_last_member_count = 2;
@@ -1825,11 +1832,11 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
spin_lock_bh(&br->multicast_lock);
switch (val) {
- case 0:
- case 2:
+ case MDB_RTR_TYPE_DISABLED:
+ case MDB_RTR_TYPE_PERM:
del_timer(&br->multicast_router_timer);
/* fall through */
- case 1:
+ case MDB_RTR_TYPE_TEMP_QUERY:
br->multicast_router = val;
err = 0;
break;
@@ -1840,37 +1847,53 @@ int br_multicast_set_router(struct net_bridge *br, unsigned long val)
return err;
}
+static void __del_port_router(struct net_bridge_port *p)
+{
+ if (hlist_unhashed(&p->rlist))
+ return;
+ hlist_del_init_rcu(&p->rlist);
+ br_rtr_notify(p->br->dev, p, RTM_DELMDB);
+}
+
int br_multicast_set_port_router(struct net_bridge_port *p, unsigned long val)
{
struct net_bridge *br = p->br;
+ unsigned long now = jiffies;
int err = -EINVAL;
spin_lock(&br->multicast_lock);
-
- switch (val) {
- case 0:
- case 1:
- case 2:
- p->multicast_router = val;
+ if (p->multicast_router == val) {
+ /* Refresh the temp router port timer */
+ if (p->multicast_router == MDB_RTR_TYPE_TEMP)
+ mod_timer(&p->multicast_router_timer,
+ now + br->multicast_querier_interval);
err = 0;
-
- if (val < 2 && !hlist_unhashed(&p->rlist)) {
- hlist_del_init_rcu(&p->rlist);
- br_rtr_notify(br->dev, p, RTM_DELMDB);
- }
-
- if (val == 1)
- break;
-
+ goto unlock;
+ }
+ switch (val) {
+ case MDB_RTR_TYPE_DISABLED:
+ p->multicast_router = MDB_RTR_TYPE_DISABLED;
+ __del_port_router(p);
+ del_timer(&p->multicast_router_timer);
+ break;
+ case MDB_RTR_TYPE_TEMP_QUERY:
+ p->multicast_router = MDB_RTR_TYPE_TEMP_QUERY;
+ __del_port_router(p);
+ break;
+ case MDB_RTR_TYPE_PERM:
+ p->multicast_router = MDB_RTR_TYPE_PERM;
del_timer(&p->multicast_router_timer);
-
- if (val == 0)
- break;
-
br_multicast_add_router(br, p);
break;
+ case MDB_RTR_TYPE_TEMP:
+ p->multicast_router = MDB_RTR_TYPE_TEMP;
+ br_multicast_mark_router(br, p);
+ break;
+ default:
+ goto unlock;
}
-
+ err = 0;
+unlock:
spin_unlock(&br->multicast_lock);
return err;
diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 7ddbe7ec81d6..44114a94c576 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -37,6 +37,7 @@
#include <net/addrconf.h>
#include <net/route.h>
#include <net/netfilter/br_netfilter.h>
+#include <net/netns/generic.h>
#include <asm/uaccess.h>
#include "br_private.h"
@@ -44,6 +45,12 @@
#include <linux/sysctl.h>
#endif
+static int brnf_net_id __read_mostly;
+
+struct brnf_net {
+ bool enabled;
+};
+
#ifdef CONFIG_SYSCTL
static struct ctl_table_header *brnf_sysctl_header;
static int brnf_call_iptables __read_mostly = 1;
@@ -938,6 +945,53 @@ static struct nf_hook_ops br_nf_ops[] __read_mostly = {
},
};
+static int brnf_device_event(struct notifier_block *unused, unsigned long event,
+ void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct brnf_net *brnet;
+ struct net *net;
+ int ret;
+
+ if (event != NETDEV_REGISTER || !(dev->priv_flags & IFF_EBRIDGE))
+ return NOTIFY_DONE;
+
+ ASSERT_RTNL();
+
+ net = dev_net(dev);
+ brnet = net_generic(net, brnf_net_id);
+ if (brnet->enabled)
+ return NOTIFY_OK;
+
+ ret = nf_register_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ if (ret)
+ return NOTIFY_BAD;
+
+ brnet->enabled = true;
+ return NOTIFY_OK;
+}
+
+static void __net_exit brnf_exit_net(struct net *net)
+{
+ struct brnf_net *brnet = net_generic(net, brnf_net_id);
+
+ if (!brnet->enabled)
+ return;
+
+ nf_unregister_net_hooks(net, br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ brnet->enabled = false;
+}
+
+static struct pernet_operations brnf_net_ops __read_mostly = {
+ .exit = brnf_exit_net,
+ .id = &brnf_net_id,
+ .size = sizeof(struct brnf_net),
+};
+
+static struct notifier_block brnf_notifier __read_mostly = {
+ .notifier_call = brnf_device_event,
+};
+
#ifdef CONFIG_SYSCTL
static
int brnf_sysctl_call_tables(struct ctl_table *ctl, int write,
@@ -1003,16 +1057,23 @@ static int __init br_netfilter_init(void)
{
int ret;
- ret = nf_register_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ ret = register_pernet_subsys(&brnf_net_ops);
if (ret < 0)
return ret;
+ ret = register_netdevice_notifier(&brnf_notifier);
+ if (ret < 0) {
+ unregister_pernet_subsys(&brnf_net_ops);
+ return ret;
+ }
+
#ifdef CONFIG_SYSCTL
brnf_sysctl_header = register_net_sysctl(&init_net, "net/bridge", brnf_table);
if (brnf_sysctl_header == NULL) {
printk(KERN_WARNING
"br_netfilter: can't register to sysctl.\n");
- nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ unregister_netdevice_notifier(&brnf_notifier);
+ unregister_pernet_subsys(&brnf_net_ops);
return -ENOMEM;
}
#endif
@@ -1024,7 +1085,8 @@ static int __init br_netfilter_init(void)
static void __exit br_netfilter_fini(void)
{
RCU_INIT_POINTER(nf_br_ops, NULL);
- nf_unregister_hooks(br_nf_ops, ARRAY_SIZE(br_nf_ops));
+ unregister_netdevice_notifier(&brnf_notifier);
+ unregister_pernet_subsys(&brnf_net_ops);
#ifdef CONFIG_SYSCTL
unregister_net_sysctl_table(brnf_sysctl_header);
#endif
diff --git a/net/bridge/br_netlink.c b/net/bridge/br_netlink.c
index 40197ff8918a..e9c635eae24d 100644
--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -598,7 +598,6 @@ static int br_set_port_state(struct net_bridge_port *p, u8 state)
return -ENETDOWN;
br_set_state(p, state);
- br_log_state(p);
br_port_state_selection(p->br);
return 0;
}
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index 216018c76018..1b5d145dfcbf 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -150,6 +150,9 @@ struct net_bridge_fdb_entry
struct rcu_head rcu;
};
+#define MDB_PG_FLAGS_PERMANENT BIT(0)
+#define MDB_PG_FLAGS_OFFLOAD BIT(1)
+
struct net_bridge_port_group {
struct net_bridge_port *port;
struct net_bridge_port_group __rcu *next;
@@ -157,7 +160,7 @@ struct net_bridge_port_group {
struct rcu_head rcu;
struct timer_list timer;
struct br_ip addr;
- unsigned char state;
+ unsigned char flags;
};
struct net_bridge_mdb_entry
@@ -554,11 +557,11 @@ void br_multicast_free_pg(struct rcu_head *head);
struct net_bridge_port_group *
br_multicast_new_port_group(struct net_bridge_port *port, struct br_ip *group,
struct net_bridge_port_group __rcu *next,
- unsigned char state);
+ unsigned char flags);
void br_mdb_init(void);
void br_mdb_uninit(void);
-void br_mdb_notify(struct net_device *dev, struct net_bridge_port *port,
- struct br_ip *group, int type, u8 state);
+void br_mdb_notify(struct net_device *dev, struct net_bridge_port_group *pg,
+ int type);
void br_rtr_notify(struct net_device *dev, struct net_bridge_port *port,
int type);
@@ -897,7 +900,6 @@ static inline void br_nf_core_fini(void) {}
#endif
/* br_stp.c */
-void br_log_state(const struct net_bridge_port *p);
void br_set_state(struct net_bridge_port *p, unsigned int state);
struct net_bridge_port *br_get_port(struct net_bridge *br, u16 port_no);
void br_init_port(struct net_bridge_port *p);
diff --git a/net/bridge/br_stp.c b/net/bridge/br_stp.c
index 80c34d70218c..9cb7044d0801 100644
--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -30,16 +30,10 @@ static const char *const br_port_state_names[] = {
[BR_STATE_BLOCKING] = "blocking",
};
-void br_log_state(const struct net_bridge_port *p)
-{
- br_info(p->br, "port %u(%s) entered %s state\n",
- (unsigned int) p->port_no, p->dev->name,
- br_port_state_names[p->state]);
-}
-
void br_set_state(struct net_bridge_port *p, unsigned int state)
{
struct switchdev_attr attr = {
+ .orig_dev = p->dev,
.id = SWITCHDEV_ATTR_ID_PORT_STP_STATE,
.flags = SWITCHDEV_F_DEFER,
.u.stp_state = state,
@@ -48,9 +42,13 @@ void br_set_state(struct net_bridge_port *p, unsigned int state)
p->state = state;
err = switchdev_port_attr_set(p->dev, &attr);
- if (err)
+ if (err && err != -EOPNOTSUPP)
br_warn(p->br, "error setting offload STP state on port %u(%s)\n",
(unsigned int) p->port_no, p->dev->name);
+ else
+ br_info(p->br, "port %u(%s) entered %s state\n",
+ (unsigned int) p->port_no, p->dev->name,
+ br_port_state_names[p->state]);
}
/* called under bridge lock */
@@ -125,7 +123,6 @@ static void br_root_port_block(const struct net_bridge *br,
(unsigned int) p->port_no, p->dev->name);
br_set_state(p, BR_STATE_LISTENING);
- br_log_state(p);
br_ifinfo_notify(RTM_NEWLINK, p);
if (br->forward_delay > 0)
@@ -406,7 +403,6 @@ static void br_make_blocking(struct net_bridge_port *p)
br_topology_change_detection(p->br);
br_set_state(p, BR_STATE_BLOCKING);
- br_log_state(p);
br_ifinfo_notify(RTM_NEWLINK, p);
del_timer(&p->forward_delay_timer);
@@ -430,7 +426,6 @@ static void br_make_forwarding(struct net_bridge_port *p)
else
br_set_state(p, BR_STATE_LEARNING);
- br_log_state(p);
br_ifinfo_notify(RTM_NEWLINK, p);
if (br->forward_delay != 0)
@@ -567,9 +562,18 @@ int br_set_max_age(struct net_bridge *br, unsigned long val)
}
+/* Set time interval that dynamic forwarding entries live
+ * For pure software bridge, allow values outside the 802.1
+ * standard specification for special cases:
+ * 0 - entry never ages (all permanant)
+ * 1 - entry disappears (no persistance)
+ *
+ * Offloaded switch entries maybe more restrictive
+ */
int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
{
struct switchdev_attr attr = {
+ .orig_dev = br->dev,
.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
.u.ageing_time = ageing_time,
@@ -577,11 +581,8 @@ int br_set_ageing_time(struct net_bridge *br, u32 ageing_time)
unsigned long t = clock_t_to_jiffies(ageing_time);
int err;
- if (t < BR_MIN_AGEING_TIME || t > BR_MAX_AGEING_TIME)
- return -ERANGE;
-
err = switchdev_port_attr_set(br->dev, &attr);
- if (err)
+ if (err && err != -EOPNOTSUPP)
return err;
br->ageing_time = t;
@@ -600,12 +601,17 @@ void __br_set_forward_delay(struct net_bridge *br, unsigned long t)
int br_set_forward_delay(struct net_bridge *br, unsigned long val)
{
unsigned long t = clock_t_to_jiffies(val);
-
- if (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY)
- return -ERANGE;
+ int err = -ERANGE;
spin_lock_bh(&br->lock);
+ if (br->stp_enabled != BR_NO_STP &&
+ (t < BR_MIN_FORWARD_DELAY || t > BR_MAX_FORWARD_DELAY))
+ goto unlock;
+
__br_set_forward_delay(br, t);
+ err = 0;
+
+unlock:
spin_unlock_bh(&br->lock);
- return 0;
+ return err;
}
diff --git a/net/bridge/br_stp_if.c b/net/bridge/br_stp_if.c
index fa53d7a89f48..984d46263007 100644
--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -37,9 +37,10 @@ static inline port_id br_make_port_id(__u8 priority, __u16 port_no)
void br_init_port(struct net_bridge_port *p)
{
struct switchdev_attr attr = {
+ .orig_dev = p->dev,
.id = SWITCHDEV_ATTR_ID_BRIDGE_AGEING_TIME,
.flags = SWITCHDEV_F_SKIP_EOPNOTSUPP | SWITCHDEV_F_DEFER,
- .u.ageing_time = p->br->ageing_time,
+ .u.ageing_time = jiffies_to_clock_t(p->br->ageing_time),
};
int err;
@@ -50,7 +51,7 @@ void br_init_port(struct net_bridge_port *p)
p->config_pending = 0;
err = switchdev_port_attr_set(p->dev, &attr);
- if (err)
+ if (err && err != -EOPNOTSUPP)
netdev_err(p->dev, "failed to set HW ageing time\n");
}
@@ -101,7 +102,6 @@ void br_stp_enable_port(struct net_bridge_port *p)
{
br_init_port(p);
br_port_state_selection(p->br);
- br_log_state(p);
br_ifinfo_notify(RTM_NEWLINK, p);
}
@@ -117,7 +117,6 @@ void br_stp_disable_port(struct net_bridge_port *p)
p->topology_change_ack = 0;
p->config_pending = 0;
- br_log_state(p);
br_ifinfo_notify(RTM_NEWLINK, p);
del_timer(&p->message_age_timer);
@@ -142,7 +141,10 @@ static void br_stp_start(struct net_bridge *br)
char *envp[] = { NULL };
struct net_bridge_port *p;
- r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ if (net_eq(dev_net(br->dev), &init_net))
+ r = call_usermodehelper(BR_STP_PROG, argv, envp, UMH_WAIT_PROC);
+ else
+ r = -ENOENT;
spin_lock_bh(&br->lock);
diff --git a/net/bridge/br_stp_timer.c b/net/bridge/br_stp_timer.c
index 5f0f5af0ec35..da058b85aa22 100644
--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -98,7 +98,6 @@ static void br_forward_delay_timer_expired(unsigned long arg)
br_topology_change_detection(br);
netif_carrier_on(br->dev);
}
- br_log_state(p);
rcu_read_lock();
br_ifinfo_notify(RTM_NEWLINK, p);
rcu_read_unlock();
diff --git a/net/bridge/br_sysfs_br.c b/net/bridge/br_sysfs_br.c
index 8365bd53c421..6b8091407ca3 100644
--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -22,7 +22,6 @@
#include "br_private.h"
-#define to_dev(obj) container_of(obj, struct device, kobj)
#define to_bridge(cd) ((struct net_bridge *)netdev_priv(to_net_dev(cd)))
/*
@@ -814,7 +813,7 @@ static ssize_t brforward_read(struct file *filp, struct kobject *kobj,
struct bin_attribute *bin_attr,
char *buf, loff_t off, size_t count)
{
- struct device *dev = to_dev(kobj);
+ struct device *dev = kobj_to_dev(kobj);
struct net_bridge *br = to_bridge(dev);
int n;
diff --git a/net/bridge/br_vlan.c b/net/bridge/br_vlan.c
index 1394da63614a..9309bb4f2a5b 100644
--- a/net/bridge/br_vlan.c
+++ b/net/bridge/br_vlan.c
@@ -73,6 +73,7 @@ static int __vlan_vid_add(struct net_device *dev, struct net_bridge *br,
u16 vid, u16 flags)
{
struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = dev,
.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
.flags = flags,
.vid_begin = vid,
@@ -120,6 +121,7 @@ static int __vlan_vid_del(struct net_device *dev, struct net_bridge *br,
u16 vid)
{
struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = dev,
.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
.vid_begin = vid,
.vid_end = vid,
@@ -624,9 +626,21 @@ void br_recalculate_fwd_mask(struct net_bridge *br)
int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
{
+ struct switchdev_attr attr = {
+ .orig_dev = br->dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+ .u.vlan_filtering = val,
+ };
+ int err;
+
if (br->vlan_enabled == val)
return 0;
+ err = switchdev_port_attr_set(br->dev, &attr);
+ if (err && err != -EOPNOTSUPP)
+ return err;
+
br->vlan_enabled = val;
br_manage_promisc(br);
recalculate_group_addr(br);
@@ -637,13 +651,15 @@ int __br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
int br_vlan_filter_toggle(struct net_bridge *br, unsigned long val)
{
+ int err;
+
if (!rtnl_trylock())
return restart_syscall();
- __br_vlan_filter_toggle(br, val);
+ err = __br_vlan_filter_toggle(br, val);
rtnl_unlock();
- return 0;
+ return err;
}
int __br_vlan_set_proto(struct net_bridge *br, __be16 proto)
@@ -891,6 +907,12 @@ err_rhtbl:
int nbp_vlan_init(struct net_bridge_port *p)
{
+ struct switchdev_attr attr = {
+ .orig_dev = p->br->dev,
+ .id = SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING,
+ .flags = SWITCHDEV_F_SKIP_EOPNOTSUPP,
+ .u.vlan_filtering = p->br->vlan_enabled,
+ };
struct net_bridge_vlan_group *vg;
int ret = -ENOMEM;
@@ -898,6 +920,10 @@ int nbp_vlan_init(struct net_bridge_port *p)
if (!vg)
goto out;
+ ret = switchdev_port_attr_set(p->dev, &attr);
+ if (ret && ret != -EOPNOTSUPP)
+ goto err_vlan_enabled;
+
ret = rhashtable_init(&vg->vlan_hash, &br_vlan_rht_params);
if (ret)
goto err_rhtbl;
@@ -917,6 +943,7 @@ err_vlan_add:
RCU_INIT_POINTER(p->vlgrp, NULL);
synchronize_rcu();
rhashtable_destroy(&vg->vlan_hash);
+err_vlan_enabled:
err_rhtbl:
kfree(vg);
@@ -928,6 +955,13 @@ err_rhtbl:
*/
int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags)
{
+ struct switchdev_obj_port_vlan v = {
+ .obj.orig_dev = port->dev,
+ .obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
+ .flags = flags,
+ .vid_begin = vid,
+ .vid_end = vid,
+ };
struct net_bridge_vlan *vlan;
int ret;
@@ -935,6 +969,10 @@ int nbp_vlan_add(struct net_bridge_port *port, u16 vid, u16 flags)
vlan = br_vlan_find(nbp_vlan_group(port), vid);
if (vlan) {
+ /* Pass the flags to the hardware bridge */
+ ret = switchdev_port_obj_add(port->dev, &v.obj);
+ if (ret && ret != -EOPNOTSUPP)
+ return ret;
__vlan_add_flags(vlan, flags);
return 0;
}
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 17fd5f2cb4b8..98de6e7fd86d 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -65,8 +65,8 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param *par)
return false;
if (FWINV(info->protocol != nexthdr, EBT_IP6_PROTO))
return false;
- if (!(info->bitmask & ( EBT_IP6_DPORT |
- EBT_IP6_SPORT | EBT_IP6_ICMP6)))
+ if (!(info->bitmask & (EBT_IP6_DPORT |
+ EBT_IP6_SPORT | EBT_IP6_ICMP6)))
return true;
/* min icmpv6 headersize is 4, so sizeof(_pkthdr) is ok. */
diff --git a/net/bridge/netfilter/ebt_log.c b/net/bridge/netfilter/ebt_log.c
index 0ad639a96142..152300d164ac 100644
--- a/net/bridge/netfilter/ebt_log.c
+++ b/net/bridge/netfilter/ebt_log.c
@@ -36,14 +36,12 @@ static int ebt_log_tg_check(const struct xt_tgchk_param *par)
return 0;
}
-struct tcpudphdr
-{
+struct tcpudphdr {
__be16 src;
__be16 dst;
};
-struct arppayload
-{
+struct arppayload {
unsigned char mac_src[ETH_ALEN];
unsigned char ip_src[4];
unsigned char mac_dst[ETH_ALEN];
@@ -152,7 +150,8 @@ ebt_log_packet(struct net *net, u_int8_t pf, unsigned int hooknum,
ntohs(ah->ar_op));
/* If it's for Ethernet and the lengths are OK,
- * then log the ARP payload */
+ * then log the ARP payload
+ */
if (ah->ar_hrd == htons(1) &&
ah->ar_hln == ETH_ALEN &&
ah->ar_pln == sizeof(__be32)) {
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index 0c40570069ba..6b731e12ecfa 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -41,7 +41,7 @@ struct stp_config_pdu {
#define NR32(p) ((p[0] << 24) | (p[1] << 16) | (p[2] << 8) | p[3])
static bool ebt_filter_config(const struct ebt_stp_info *info,
- const struct stp_config_pdu *stpc)
+ const struct stp_config_pdu *stpc)
{
const struct ebt_stp_config_info *c;
uint16_t v16;
diff --git a/net/bridge/netfilter/ebt_vlan.c b/net/bridge/netfilter/ebt_vlan.c
index 618568888128..98c221dbf059 100644
--- a/net/bridge/netfilter/ebt_vlan.c
+++ b/net/bridge/netfilter/ebt_vlan.c
@@ -66,7 +66,8 @@ ebt_vlan_mt(const struct sk_buff *skb, struct xt_action_param *par)
* - Canonical Format Indicator (CFI). The Canonical Format Indicator
* (CFI) is a single bit flag value. Currently ignored.
* - VLAN Identifier (VID). The VID is encoded as
- * an unsigned binary number. */
+ * an unsigned binary number.
+ */
id = TCI & VLAN_VID_MASK;
prio = (TCI >> 13) & 0x7;
@@ -98,7 +99,8 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
}
/* Check for bitmask range
- * True if even one bit is out of mask */
+ * True if even one bit is out of mask
+ */
if (info->bitmask & ~EBT_VLAN_MASK) {
pr_debug("bitmask %2X is out of mask (%2X)\n",
info->bitmask, EBT_VLAN_MASK);
@@ -117,7 +119,8 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
* 0 - The null VLAN ID.
* 1 - The default Port VID (PVID)
* 0x0FFF - Reserved for implementation use.
- * if_vlan.h: VLAN_N_VID 4096. */
+ * if_vlan.h: VLAN_N_VID 4096.
+ */
if (GET_BITMASK(EBT_VLAN_ID)) {
if (!!info->id) { /* if id!=0 => check vid range */
if (info->id > VLAN_N_VID) {
@@ -128,7 +131,8 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
/* Note: This is valid VLAN-tagged frame point.
* Any value of user_priority are acceptable,
* but should be ignored according to 802.1Q Std.
- * So we just drop the prio flag. */
+ * So we just drop the prio flag.
+ */
info->bitmask &= ~EBT_VLAN_PRIO;
}
/* Else, id=0 (null VLAN ID) => user_priority range (any?) */
@@ -143,7 +147,8 @@ static int ebt_vlan_mt_check(const struct xt_mtchk_param *par)
}
/* Check for encapsulated proto range - it is possible to be
* any value for u_short range.
- * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS */
+ * if_ether.h: ETH_ZLEN 60 - Min. octets in frame sans FCS
+ */
if (GET_BITMASK(EBT_VLAN_ENCAP)) {
if ((unsigned short) ntohs(info->encap) < ETH_ZLEN) {
pr_debug("encap frame length %d is less than "
diff --git a/net/bridge/netfilter/ebtable_filter.c b/net/bridge/netfilter/ebtable_filter.c
index 32eccd101f26..593a1bdc079e 100644
--- a/net/bridge/netfilter/ebtable_filter.c
+++ b/net/bridge/netfilter/ebtable_filter.c
@@ -12,7 +12,7 @@
#include <linux/module.h>
#define FILTER_VALID_HOOKS ((1 << NF_BR_LOCAL_IN) | (1 << NF_BR_FORWARD) | \
- (1 << NF_BR_LOCAL_OUT))
+ (1 << NF_BR_LOCAL_OUT))
static struct ebt_entries initial_chains[] = {
{
diff --git a/net/bridge/netfilter/ebtable_nat.c b/net/bridge/netfilter/ebtable_nat.c
index ec55358f00c8..eb33919821ee 100644
--- a/net/bridge/netfilter/ebtable_nat.c
+++ b/net/bridge/netfilter/ebtable_nat.c
@@ -12,7 +12,7 @@
#include <linux/module.h>
#define NAT_VALID_HOOKS ((1 << NF_BR_PRE_ROUTING) | (1 << NF_BR_LOCAL_OUT) | \
- (1 << NF_BR_POST_ROUTING))
+ (1 << NF_BR_POST_ROUTING))
static struct ebt_entries initial_chains[] = {
{
diff --git a/net/bridge/netfilter/ebtables.c b/net/bridge/netfilter/ebtables.c
index f46ca417bf2d..8570bc7744c2 100644
--- a/net/bridge/netfilter/ebtables.c
+++ b/net/bridge/netfilter/ebtables.c
@@ -35,8 +35,7 @@
"report to author: "format, ## args)
/* #define BUGPRINT(format, args...) */
-/*
- * Each cpu has its own set of counters, so there is no need for write_lock in
+/* Each cpu has its own set of counters, so there is no need for write_lock in
* the softirq
* For reading or updating the counters, the user context needs to
* get a write_lock
@@ -46,7 +45,7 @@
#define SMP_ALIGN(x) (((x) + SMP_CACHE_BYTES-1) & ~(SMP_CACHE_BYTES-1))
#define COUNTER_OFFSET(n) (SMP_ALIGN(n * sizeof(struct ebt_counter)))
#define COUNTER_BASE(c, n, cpu) ((struct ebt_counter *)(((char *)c) + \
- COUNTER_OFFSET(n) * cpu))
+ COUNTER_OFFSET(n) * cpu))
@@ -126,7 +125,7 @@ ebt_dev_check(const char *entry, const struct net_device *device)
/* process standard matches */
static inline int
ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
- const struct net_device *in, const struct net_device *out)
+ const struct net_device *in, const struct net_device *out)
{
const struct ethhdr *h = eth_hdr(skb);
const struct net_bridge_port *p;
@@ -162,7 +161,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
for (i = 0; i < 6; i++)
verdict |= (h->h_source[i] ^ e->sourcemac[i]) &
e->sourcemsk[i];
- if (FWINV2(verdict != 0, EBT_ISOURCE) )
+ if (FWINV2(verdict != 0, EBT_ISOURCE))
return 1;
}
if (e->bitmask & EBT_DESTMAC) {
@@ -170,7 +169,7 @@ ebt_basic_match(const struct ebt_entry *e, const struct sk_buff *skb,
for (i = 0; i < 6; i++)
verdict |= (h->h_dest[i] ^ e->destmac[i]) &
e->destmsk[i];
- if (FWINV2(verdict != 0, EBT_IDEST) )
+ if (FWINV2(verdict != 0, EBT_IDEST))
return 1;
}
return 0;
@@ -237,7 +236,8 @@ unsigned int ebt_do_table(struct sk_buff *skb,
(*(counter_base + i)).bcnt += skb->len;
/* these should only watch: not modify, nor tell us
- what to do with the packet */
+ * what to do with the packet
+ */
EBT_WATCHER_ITERATE(point, ebt_do_watcher, skb, &acpar);
t = (struct ebt_entry_target *)
@@ -323,7 +323,7 @@ letscontinue:
/* If it succeeds, returns element and locks mutex */
static inline void *
find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
- struct mutex *mutex)
+ struct mutex *mutex)
{
struct {
struct list_head list;
@@ -342,7 +342,7 @@ find_inlist_lock_noload(struct list_head *head, const char *name, int *error,
static void *
find_inlist_lock(struct list_head *head, const char *name, const char *prefix,
- int *error, struct mutex *mutex)
+ int *error, struct mutex *mutex)
{
return try_then_request_module(
find_inlist_lock_noload(head, name, error, mutex),
@@ -451,7 +451,8 @@ static int ebt_verify_pointers(const struct ebt_replace *repl,
if (i != NF_BR_NUMHOOKS || !(e->bitmask & EBT_ENTRY_OR_ENTRIES)) {
if (e->bitmask != 0) {
/* we make userspace set this right,
- so there is no misunderstanding */
+ * so there is no misunderstanding
+ */
BUGPRINT("EBT_ENTRY_OR_ENTRIES shouldn't be set "
"in distinguisher\n");
return -EINVAL;
@@ -487,15 +488,14 @@ static int ebt_verify_pointers(const struct ebt_replace *repl,
return 0;
}
-/*
- * this one is very careful, as it is the first function
+/* this one is very careful, as it is the first function
* to parse the userspace data
*/
static inline int
ebt_check_entry_size_and_hooks(const struct ebt_entry *e,
- const struct ebt_table_info *newinfo,
- unsigned int *n, unsigned int *cnt,
- unsigned int *totalcnt, unsigned int *udc_cnt)
+ const struct ebt_table_info *newinfo,
+ unsigned int *n, unsigned int *cnt,
+ unsigned int *totalcnt, unsigned int *udc_cnt)
{
int i;
@@ -504,10 +504,12 @@ ebt_check_entry_size_and_hooks(const struct ebt_entry *e,
break;
}
/* beginning of a new chain
- if i == NF_BR_NUMHOOKS it must be a user defined chain */
+ * if i == NF_BR_NUMHOOKS it must be a user defined chain
+ */
if (i != NF_BR_NUMHOOKS || !e->bitmask) {
/* this checks if the previous chain has as many entries
- as it said it has */
+ * as it said it has
+ */
if (*n != *cnt) {
BUGPRINT("nentries does not equal the nr of entries "
"in the chain\n");
@@ -549,20 +551,18 @@ ebt_check_entry_size_and_hooks(const struct ebt_entry *e,
return 0;
}
-struct ebt_cl_stack
-{
+struct ebt_cl_stack {
struct ebt_chainstack cs;
int from;
unsigned int hookmask;
};
-/*
- * we need these positions to check that the jumps to a different part of the
+/* We need these positions to check that the jumps to a different part of the
* entries is a jump to the beginning of a new chain.
*/
static inline int
ebt_get_udc_positions(struct ebt_entry *e, struct ebt_table_info *newinfo,
- unsigned int *n, struct ebt_cl_stack *udc)
+ unsigned int *n, struct ebt_cl_stack *udc)
{
int i;
@@ -649,9 +649,9 @@ ebt_cleanup_entry(struct ebt_entry *e, struct net *net, unsigned int *cnt)
static inline int
ebt_check_entry(struct ebt_entry *e, struct net *net,
- const struct ebt_table_info *newinfo,
- const char *name, unsigned int *cnt,
- struct ebt_cl_stack *cl_s, unsigned int udc_cnt)
+ const struct ebt_table_info *newinfo,
+ const char *name, unsigned int *cnt,
+ struct ebt_cl_stack *cl_s, unsigned int udc_cnt)
{
struct ebt_entry_target *t;
struct xt_target *target;
@@ -673,7 +673,7 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
BUGPRINT("Unknown flag for inv bitmask\n");
return -EINVAL;
}
- if ( (e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3) ) {
+ if ((e->bitmask & EBT_NOPROTO) && (e->bitmask & EBT_802_3)) {
BUGPRINT("NOPROTO & 802_3 not allowed\n");
return -EINVAL;
}
@@ -687,7 +687,8 @@ ebt_check_entry(struct ebt_entry *e, struct net *net,
break;
}
/* (1 << NF_BR_NUMHOOKS) tells the check functions the rule is on
- a base chain */
+ * a base chain
+ */
if (i < NF_BR_NUMHOOKS)
hookmask = (1 << hook) | (1 << NF_BR_NUMHOOKS);
else {
@@ -758,13 +759,12 @@ cleanup_matches:
return ret;
}
-/*
- * checks for loops and sets the hook mask for udc
+/* checks for loops and sets the hook mask for udc
* the hook mask for udc tells us from which base chains the udc can be
* accessed. This mask is a parameter to the check() functions of the extensions
*/
static int check_chainloops(const struct ebt_entries *chain, struct ebt_cl_stack *cl_s,
- unsigned int udc_cnt, unsigned int hooknr, char *base)
+ unsigned int udc_cnt, unsigned int hooknr, char *base)
{
int i, chain_nr = -1, pos = 0, nentries = chain->nentries, verdict;
const struct ebt_entry *e = (struct ebt_entry *)chain->data;
@@ -853,7 +853,8 @@ static int translate_table(struct net *net, const char *name,
return -EINVAL;
}
/* make sure chains are ordered after each other in same order
- as their corresponding hooks */
+ * as their corresponding hooks
+ */
for (j = i + 1; j < NF_BR_NUMHOOKS; j++) {
if (!newinfo->hook_entry[j])
continue;
@@ -868,7 +869,8 @@ static int translate_table(struct net *net, const char *name,
i = 0; /* holds the expected nr. of entries for the chain */
j = 0; /* holds the up to now counted entries for the chain */
k = 0; /* holds the total nr. of entries, should equal
- newinfo->nentries afterwards */
+ * newinfo->nentries afterwards
+ */
udc_cnt = 0; /* will hold the nr. of user defined chains (udc) */
ret = EBT_ENTRY_ITERATE(newinfo->entries, newinfo->entries_size,
ebt_check_entry_size_and_hooks, newinfo,
@@ -888,10 +890,12 @@ static int translate_table(struct net *net, const char *name,
}
/* get the location of the udc, put them in an array
- while we're at it, allocate the chainstack */
+ * while we're at it, allocate the chainstack
+ */
if (udc_cnt) {
/* this will get free'd in do_replace()/ebt_register_table()
- if an error occurs */
+ * if an error occurs
+ */
newinfo->chainstack =
vmalloc(nr_cpu_ids * sizeof(*(newinfo->chainstack)));
if (!newinfo->chainstack)
@@ -932,14 +936,15 @@ static int translate_table(struct net *net, const char *name,
}
/* we now know the following (along with E=mc²):
- - the nr of entries in each chain is right
- - the size of the allocated space is right
- - all valid hooks have a corresponding chain
- - there are no loops
- - wrong data can still be on the level of a single entry
- - could be there are jumps to places that are not the
- beginning of a chain. This can only occur in chains that
- are not accessible from any base chains, so we don't care. */
+ * - the nr of entries in each chain is right
+ * - the size of the allocated space is right
+ * - all valid hooks have a corresponding chain
+ * - there are no loops
+ * - wrong data can still be on the level of a single entry
+ * - could be there are jumps to places that are not the
+ * beginning of a chain. This can only occur in chains that
+ * are not accessible from any base chains, so we don't care.
+ */
/* used to know what we need to clean up if something goes wrong */
i = 0;
@@ -955,7 +960,7 @@ static int translate_table(struct net *net, const char *name,
/* called under write_lock */
static void get_counters(const struct ebt_counter *oldcounters,
- struct ebt_counter *counters, unsigned int nentries)
+ struct ebt_counter *counters, unsigned int nentries)
{
int i, cpu;
struct ebt_counter *counter_base;
@@ -986,7 +991,8 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
struct ebt_table *t;
/* the user wants counters back
- the check on the size is done later, when we have the lock */
+ * the check on the size is done later, when we have the lock
+ */
if (repl->num_counters) {
unsigned long size = repl->num_counters * sizeof(*counterstmp);
counterstmp = vmalloc(size);
@@ -1038,9 +1044,10 @@ static int do_replace_finish(struct net *net, struct ebt_replace *repl,
write_unlock_bh(&t->lock);
mutex_unlock(&ebt_mutex);
/* so, a user can change the chains while having messed up her counter
- allocation. Only reason why this is done is because this way the lock
- is held only once, while this doesn't bring the kernel into a
- dangerous state. */
+ * allocation. Only reason why this is done is because this way the lock
+ * is held only once, while this doesn't bring the kernel into a
+ * dangerous state.
+ */
if (repl->num_counters &&
copy_to_user(repl->counters, counterstmp,
repl->num_counters * sizeof(struct ebt_counter))) {
@@ -1342,13 +1349,14 @@ static int update_counters(struct net *net, const void __user *user,
}
static inline int ebt_make_matchname(const struct ebt_entry_match *m,
- const char *base, char __user *ubase)
+ const char *base, char __user *ubase)
{
char __user *hlp = ubase + ((char *)m - base);
char name[EBT_FUNCTION_MAXNAMELEN] = {};
/* ebtables expects 32 bytes long names but xt_match names are 29 bytes
- long. Copy 29 bytes and fill remaining bytes with zeroes. */
+ * long. Copy 29 bytes and fill remaining bytes with zeroes.
+ */
strlcpy(name, m->u.match->name, sizeof(name));
if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
return -EFAULT;
@@ -1356,19 +1364,19 @@ static inline int ebt_make_matchname(const struct ebt_entry_match *m,
}
static inline int ebt_make_watchername(const struct ebt_entry_watcher *w,
- const char *base, char __user *ubase)
+ const char *base, char __user *ubase)
{
char __user *hlp = ubase + ((char *)w - base);
char name[EBT_FUNCTION_MAXNAMELEN] = {};
strlcpy(name, w->u.watcher->name, sizeof(name));
- if (copy_to_user(hlp , name, EBT_FUNCTION_MAXNAMELEN))
+ if (copy_to_user(hlp, name, EBT_FUNCTION_MAXNAMELEN))
return -EFAULT;
return 0;
}
-static inline int
-ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
+static inline int ebt_make_names(struct ebt_entry *e, const char *base,
+ char __user *ubase)
{
int ret;
char __user *hlp;
@@ -1394,9 +1402,9 @@ ebt_make_names(struct ebt_entry *e, const char *base, char __user *ubase)
}
static int copy_counters_to_user(struct ebt_table *t,
- const struct ebt_counter *oldcounters,
- void __user *user, unsigned int num_counters,
- unsigned int nentries)
+ const struct ebt_counter *oldcounters,
+ void __user *user, unsigned int num_counters,
+ unsigned int nentries)
{
struct ebt_counter *counterstmp;
int ret = 0;
@@ -1427,7 +1435,7 @@ static int copy_counters_to_user(struct ebt_table *t,
/* called with ebt_mutex locked */
static int copy_everything_to_user(struct ebt_table *t, void __user *user,
- const int *len, int cmd)
+ const int *len, int cmd)
{
struct ebt_replace tmp;
const struct ebt_counter *oldcounters;
@@ -1513,6 +1521,8 @@ static int do_ebt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
if (copy_from_user(&tmp, user, sizeof(tmp)))
return -EFAULT;
+ tmp.name[sizeof(tmp.name) - 1] = '\0';
+
t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
if (!t)
return ret;
@@ -1595,8 +1605,7 @@ static int ebt_compat_entry_padsize(void)
static int ebt_compat_match_offset(const struct xt_match *match,
unsigned int userlen)
{
- /*
- * ebt_among needs special handling. The kernel .matchsize is
+ /* ebt_among needs special handling. The kernel .matchsize is
* set to -1 at registration time; at runtime an EBT_ALIGN()ed
* value is expected.
* Example: userspace sends 4500, ebt_among.c wants 4504.
@@ -1966,8 +1975,7 @@ static int compat_mtw_from_user(struct compat_ebt_entry_mwt *mwt,
return off + match_size;
}
-/*
- * return size of all matches, watchers or target, including necessary
+/* return size of all matches, watchers or target, including necessary
* alignment and padding.
*/
static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
@@ -2070,8 +2078,7 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
if (ret < 0)
return ret;
buf_start = (char *) entry;
- /*
- * 0: matches offset, always follows ebt_entry.
+ /* 0: matches offset, always follows ebt_entry.
* 1: watchers offset, from ebt_entry structure
* 2: target offset, from ebt_entry structure
* 3: next ebt_entry offset, from ebt_entry structure
@@ -2115,8 +2122,7 @@ static int size_entry_mwt(struct ebt_entry *entry, const unsigned char *base,
return 0;
}
-/*
- * repl->entries_size is the size of the ebt_entry blob in userspace.
+/* repl->entries_size is the size of the ebt_entry blob in userspace.
* It might need more memory when copied to a 64 bit kernel in case
* userspace is 32-bit. So, first task: find out how much memory is needed.
*
@@ -2305,7 +2311,7 @@ static int compat_do_ebt_set_ctl(struct sock *sk,
break;
default:
ret = -EINVAL;
- }
+ }
return ret;
}
@@ -2328,6 +2334,8 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
if (copy_from_user(&tmp, user, sizeof(tmp)))
return -EFAULT;
+ tmp.name[sizeof(tmp.name) - 1] = '\0';
+
t = find_table_lock(net, tmp.name, &ret, &ebt_mutex);
if (!t)
return ret;
@@ -2360,8 +2368,7 @@ static int compat_do_ebt_get_ctl(struct sock *sk, int cmd,
break;
case EBT_SO_GET_ENTRIES:
case EBT_SO_GET_INIT_ENTRIES:
- /*
- * try real handler first in case of userland-side padding.
+ /* try real handler first in case of userland-side padding.
* in case we are dealing with an 'ordinary' 32 bit binary
* without 64bit compatibility padding, this will fail right
* after copy_from_user when the *len argument is validated.
diff --git a/net/bridge/netfilter/nf_tables_bridge.c b/net/bridge/netfilter/nf_tables_bridge.c
index 62f6b1b19589..7fcdd7261d88 100644
--- a/net/bridge/netfilter/nf_tables_bridge.c
+++ b/net/bridge/netfilter/nf_tables_bridge.c
@@ -141,7 +141,7 @@ err:
static void nf_tables_bridge_exit_net(struct net *net)
{
- nft_unregister_afinfo(net->nft.bridge);
+ nft_unregister_afinfo(net, net->nft.bridge);
kfree(net->nft.bridge);
}
diff --git a/net/bridge/netfilter/nft_meta_bridge.c b/net/bridge/netfilter/nft_meta_bridge.c
index a21269b83f16..4b901d9f2e7c 100644
--- a/net/bridge/netfilter/nft_meta_bridge.c
+++ b/net/bridge/netfilter/nft_meta_bridge.c
@@ -84,6 +84,7 @@ static const struct nft_expr_ops nft_meta_bridge_set_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
.eval = nft_meta_set_eval,
.init = nft_meta_set_init,
+ .destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
};
diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c
index fdba3d9fbff3..77f7e7a9ebe1 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -40,7 +40,8 @@ static void nft_reject_br_push_etherhdr(struct sk_buff *oldskb,
/* We cannot use oldskb->dev, it can be either bridge device (NF_BRIDGE INPUT)
* or the bridge port (NF_BRIDGE PREROUTING).
*/
-static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb,
+static void nft_reject_br_send_v4_tcp_reset(struct net *net,
+ struct sk_buff *oldskb,
const struct net_device *dev,
int hook)
{
@@ -63,9 +64,9 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb,
skb_reserve(nskb, LL_MAX_HEADER);
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_TCP,
- sysctl_ip_default_ttl);
+ net->ipv4.sysctl_ip_default_ttl);
nf_reject_ip_tcphdr_put(nskb, oldskb, oth);
- niph->ttl = sysctl_ip_default_ttl;
+ niph->ttl = net->ipv4.sysctl_ip_default_ttl;
niph->tot_len = htons(nskb->len);
ip_send_check(niph);
@@ -74,7 +75,8 @@ static void nft_reject_br_send_v4_tcp_reset(struct sk_buff *oldskb,
br_deliver(br_port_get_rcu(dev), nskb);
}
-static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb,
+static void nft_reject_br_send_v4_unreach(struct net *net,
+ struct sk_buff *oldskb,
const struct net_device *dev,
int hook, u8 code)
{
@@ -119,7 +121,7 @@ static void nft_reject_br_send_v4_unreach(struct sk_buff *oldskb,
skb_reserve(nskb, LL_MAX_HEADER);
niph = nf_reject_iphdr_put(nskb, oldskb, IPPROTO_ICMP,
- sysctl_ip_default_ttl);
+ net->ipv4.sysctl_ip_default_ttl);
skb_reset_transport_header(nskb);
icmph = (struct icmphdr *)skb_put(nskb, sizeof(struct icmphdr));
@@ -271,17 +273,17 @@ static void nft_reject_bridge_eval(const struct nft_expr *expr,
case htons(ETH_P_IP):
switch (priv->type) {
case NFT_REJECT_ICMP_UNREACH:
- nft_reject_br_send_v4_unreach(pkt->skb, pkt->in,
- pkt->hook,
+ nft_reject_br_send_v4_unreach(pkt->net, pkt->skb,
+ pkt->in, pkt->hook,
priv->icmp_code);
break;
case NFT_REJECT_TCP_RST:
- nft_reject_br_send_v4_tcp_reset(pkt->skb, pkt->in,
- pkt->hook);
+ nft_reject_br_send_v4_tcp_reset(pkt->net, pkt->skb,
+ pkt->in, pkt->hook);
break;
case NFT_REJECT_ICMPX_UNREACH:
- nft_reject_br_send_v4_unreach(pkt->skb, pkt->in,
- pkt->hook,
+ nft_reject_br_send_v4_unreach(pkt->net, pkt->skb,
+ pkt->in, pkt->hook,
nft_reject_icmp_code(priv->icmp_code));
break;
}
diff --git a/net/caif/caif_socket.c b/net/caif/caif_socket.c
index cc858919108e..aa209b1066c9 100644
--- a/net/caif/caif_socket.c
+++ b/net/caif/caif_socket.c
@@ -323,7 +323,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo)
!timeo)
break;
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
release_sock(sk);
timeo = schedule_timeout(timeo);
lock_sock(sk);
@@ -331,7 +331,7 @@ static long caif_stream_data_wait(struct sock *sk, long timeo)
if (sock_flag(sk, SOCK_DEAD))
break;
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
}
finish_wait(sk_sleep(sk), &wait);
diff --git a/net/caif/cfpkt_skbuff.c b/net/caif/cfpkt_skbuff.c
index f6c3b2137eea..59ce1fcc220c 100644
--- a/net/caif/cfpkt_skbuff.c
+++ b/net/caif/cfpkt_skbuff.c
@@ -286,7 +286,7 @@ int cfpkt_setlen(struct cfpkt *pkt, u16 len)
else
skb_trim(skb, len);
- return cfpkt_getlen(pkt);
+ return cfpkt_getlen(pkt);
}
/* Need to expand SKB */
diff --git a/net/caif/cfrfml.c b/net/caif/cfrfml.c
index 61d7617d9249..b82440e1fcb4 100644
--- a/net/caif/cfrfml.c
+++ b/net/caif/cfrfml.c
@@ -159,7 +159,7 @@ static int cfrfml_receive(struct cflayer *layr, struct cfpkt *pkt)
tmppkt = NULL;
/* Verify that length is correct */
- err = EPROTO;
+ err = -EPROTO;
if (rfml->pdu_size != cfpkt_getlen(pkt) - RFM_HEAD_SIZE + 1)
goto out;
}
diff --git a/net/ceph/auth_x.c b/net/ceph/auth_x.c
index ba6eb17226da..9e43a315e662 100644
--- a/net/ceph/auth_x.c
+++ b/net/ceph/auth_x.c
@@ -8,6 +8,7 @@
#include <linux/ceph/decode.h>
#include <linux/ceph/auth.h>
+#include <linux/ceph/libceph.h>
#include <linux/ceph/messenger.h>
#include "crypto.h"
@@ -151,7 +152,6 @@ static int process_one_ticket(struct ceph_auth_client *ac,
void *ticket_buf = NULL;
void *tp, *tpend;
void **ptp;
- struct ceph_timespec new_validity;
struct ceph_crypto_key new_session_key;
struct ceph_buffer *new_ticket_blob;
unsigned long new_expires, new_renew_after;
@@ -192,8 +192,8 @@ static int process_one_ticket(struct ceph_auth_client *ac,
if (ret)
goto out;
- ceph_decode_copy(&dp, &new_validity, sizeof(new_validity));
- ceph_decode_timespec(&validity, &new_validity);
+ ceph_decode_timespec(&validity, dp);
+ dp += sizeof(struct ceph_timespec);
new_expires = get_seconds() + validity.tv_sec;
new_renew_after = new_expires - (validity.tv_sec / 4);
dout(" expires=%lu renew_after=%lu\n", new_expires,
@@ -232,10 +232,10 @@ static int process_one_ticket(struct ceph_auth_client *ac,
ceph_buffer_put(th->ticket_blob);
th->session_key = new_session_key;
th->ticket_blob = new_ticket_blob;
- th->validity = new_validity;
th->secret_id = new_secret_id;
th->expires = new_expires;
th->renew_after = new_renew_after;
+ th->have_key = true;
dout(" got ticket service %d (%s) secret_id %lld len %d\n",
type, ceph_entity_type_name(type), th->secret_id,
(int)th->ticket_blob->vec.iov_len);
@@ -279,6 +279,15 @@ bad:
return -EINVAL;
}
+static void ceph_x_authorizer_cleanup(struct ceph_x_authorizer *au)
+{
+ ceph_crypto_key_destroy(&au->session_key);
+ if (au->buf) {
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ }
+}
+
static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
struct ceph_x_ticket_handler *th,
struct ceph_x_authorizer *au)
@@ -297,7 +306,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
ceph_crypto_key_destroy(&au->session_key);
ret = ceph_crypto_key_clone(&au->session_key, &th->session_key);
if (ret)
- return ret;
+ goto out_au;
maxlen = sizeof(*msg_a) + sizeof(msg_b) +
ceph_x_encrypt_buflen(ticket_blob_len);
@@ -309,8 +318,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
if (!au->buf) {
au->buf = ceph_buffer_new(maxlen, GFP_NOFS);
if (!au->buf) {
- ceph_crypto_key_destroy(&au->session_key);
- return -ENOMEM;
+ ret = -ENOMEM;
+ goto out_au;
}
}
au->service = th->service;
@@ -340,7 +349,7 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
ret = ceph_x_encrypt(&au->session_key, &msg_b, sizeof(msg_b),
p, end - p);
if (ret < 0)
- goto out_buf;
+ goto out_au;
p += ret;
au->buf->vec.iov_len = p - au->buf->vec.iov_base;
dout(" built authorizer nonce %llx len %d\n", au->nonce,
@@ -348,9 +357,8 @@ static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
BUG_ON(au->buf->vec.iov_len > maxlen);
return 0;
-out_buf:
- ceph_buffer_put(au->buf);
- au->buf = NULL;
+out_au:
+ ceph_x_authorizer_cleanup(au);
return ret;
}
@@ -375,6 +383,24 @@ bad:
return -ERANGE;
}
+static bool need_key(struct ceph_x_ticket_handler *th)
+{
+ if (!th->have_key)
+ return true;
+
+ return get_seconds() >= th->renew_after;
+}
+
+static bool have_key(struct ceph_x_ticket_handler *th)
+{
+ if (th->have_key) {
+ if (get_seconds() >= th->expires)
+ th->have_key = false;
+ }
+
+ return th->have_key;
+}
+
static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
{
int want = ac->want_keys;
@@ -393,20 +419,18 @@ static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
continue;
th = get_ticket_handler(ac, service);
-
if (IS_ERR(th)) {
*pneed |= service;
continue;
}
- if (get_seconds() >= th->renew_after)
+ if (need_key(th))
*pneed |= service;
- if (get_seconds() >= th->expires)
+ if (!have_key(th))
xi->have_keys &= ~service;
}
}
-
static int ceph_x_build_request(struct ceph_auth_client *ac,
void *buf, void *end)
{
@@ -624,8 +648,7 @@ static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
{
struct ceph_x_authorizer *au = (void *)a;
- ceph_crypto_key_destroy(&au->session_key);
- ceph_buffer_put(au->buf);
+ ceph_x_authorizer_cleanup(au);
kfree(au);
}
@@ -653,21 +676,32 @@ static void ceph_x_destroy(struct ceph_auth_client *ac)
remove_ticket_handler(ac, th);
}
- if (xi->auth_authorizer.buf)
- ceph_buffer_put(xi->auth_authorizer.buf);
+ ceph_x_authorizer_cleanup(&xi->auth_authorizer);
kfree(ac->private);
ac->private = NULL;
}
-static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
- int peer_type)
+static void invalidate_ticket(struct ceph_auth_client *ac, int peer_type)
{
struct ceph_x_ticket_handler *th;
th = get_ticket_handler(ac, peer_type);
if (!IS_ERR(th))
- memset(&th->validity, 0, sizeof(th->validity));
+ th->have_key = false;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type)
+{
+ /*
+ * We are to invalidate a service ticket in the hopes of
+ * getting a new, hopefully more valid, one. But, we won't get
+ * it unless our AUTH ticket is good, so invalidate AUTH ticket
+ * as well, just in case.
+ */
+ invalidate_ticket(ac, peer_type);
+ invalidate_ticket(ac, CEPH_ENTITY_TYPE_AUTH);
}
static int calcu_signature(struct ceph_x_authorizer *au,
@@ -691,8 +725,10 @@ static int ceph_x_sign_message(struct ceph_auth_handshake *auth,
struct ceph_msg *msg)
{
int ret;
- if (!auth->authorizer)
+
+ if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
+
ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
msg, &msg->footer.sig);
if (ret < 0)
@@ -707,8 +743,9 @@ static int ceph_x_check_message_signature(struct ceph_auth_handshake *auth,
__le64 sig_check;
int ret;
- if (!auth->authorizer)
+ if (ceph_test_opt(from_msgr(msg->con->msgr), NOMSGSIGN))
return 0;
+
ret = calcu_signature((struct ceph_x_authorizer *)auth->authorizer,
msg, &sig_check);
if (ret < 0)
diff --git a/net/ceph/auth_x.h b/net/ceph/auth_x.h
index e8b7c6917d47..40b1a3cf7397 100644
--- a/net/ceph/auth_x.h
+++ b/net/ceph/auth_x.h
@@ -16,7 +16,7 @@ struct ceph_x_ticket_handler {
unsigned int service;
struct ceph_crypto_key session_key;
- struct ceph_timespec validity;
+ bool have_key;
u64 secret_id;
struct ceph_buffer *ticket_blob;
diff --git a/net/ceph/ceph_common.c b/net/ceph/ceph_common.c
index 78f098a20796..dcc18c6f7cf9 100644
--- a/net/ceph/ceph_common.c
+++ b/net/ceph/ceph_common.c
@@ -245,6 +245,8 @@ enum {
Opt_nocrc,
Opt_cephx_require_signatures,
Opt_nocephx_require_signatures,
+ Opt_cephx_sign_messages,
+ Opt_nocephx_sign_messages,
Opt_tcp_nodelay,
Opt_notcp_nodelay,
};
@@ -267,6 +269,8 @@ static match_table_t opt_tokens = {
{Opt_nocrc, "nocrc"},
{Opt_cephx_require_signatures, "cephx_require_signatures"},
{Opt_nocephx_require_signatures, "nocephx_require_signatures"},
+ {Opt_cephx_sign_messages, "cephx_sign_messages"},
+ {Opt_nocephx_sign_messages, "nocephx_sign_messages"},
{Opt_tcp_nodelay, "tcp_nodelay"},
{Opt_notcp_nodelay, "notcp_nodelay"},
{-1, NULL}
@@ -357,7 +361,6 @@ ceph_parse_options(char *options, const char *dev_name,
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
- opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
/* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */
@@ -491,6 +494,12 @@ ceph_parse_options(char *options, const char *dev_name,
case Opt_nocephx_require_signatures:
opt->flags |= CEPH_OPT_NOMSGAUTH;
break;
+ case Opt_cephx_sign_messages:
+ opt->flags &= ~CEPH_OPT_NOMSGSIGN;
+ break;
+ case Opt_nocephx_sign_messages:
+ opt->flags |= CEPH_OPT_NOMSGSIGN;
+ break;
case Opt_tcp_nodelay:
opt->flags |= CEPH_OPT_TCP_NODELAY;
@@ -534,6 +543,8 @@ int ceph_print_client_options(struct seq_file *m, struct ceph_client *client)
seq_puts(m, "nocrc,");
if (opt->flags & CEPH_OPT_NOMSGAUTH)
seq_puts(m, "nocephx_require_signatures,");
+ if (opt->flags & CEPH_OPT_NOMSGSIGN)
+ seq_puts(m, "nocephx_sign_messages,");
if ((opt->flags & CEPH_OPT_TCP_NODELAY) == 0)
seq_puts(m, "notcp_nodelay,");
@@ -596,11 +607,7 @@ struct ceph_client *ceph_create_client(struct ceph_options *opt, void *private,
if (ceph_test_opt(client, MYIP))
myaddr = &client->options->my_addr;
- ceph_messenger_init(&client->msgr, myaddr,
- client->supported_features,
- client->required_features,
- ceph_test_opt(client, NOCRC),
- ceph_test_opt(client, TCP_NODELAY));
+ ceph_messenger_init(&client->msgr, myaddr);
/* subsystems */
err = ceph_monc_init(&client->monc, client);
@@ -678,6 +685,9 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
return client->auth_err;
}
+ pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid);
+ ceph_debugfs_client_init(client);
+
return 0;
}
EXPORT_SYMBOL(__ceph_open_session);
diff --git a/net/ceph/crush/mapper.c b/net/ceph/crush/mapper.c
index 393bfb22d5bb..5fcfb98f309e 100644
--- a/net/ceph/crush/mapper.c
+++ b/net/ceph/crush/mapper.c
@@ -403,6 +403,7 @@ static int is_out(const struct crush_map *map,
* @local_retries: localized retries
* @local_fallback_retries: localized fallback retries
* @recurse_to_leaf: true if we want one device under each item of given type (chooseleaf instead of choose)
+ * @stable: stable mode starts rep=0 in the recursive call for all replicas
* @vary_r: pass r to recursive calls
* @out2: second output vector for leaf items (if @recurse_to_leaf)
* @parent_r: r value passed from the parent
@@ -419,6 +420,7 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int local_fallback_retries,
int recurse_to_leaf,
unsigned int vary_r,
+ unsigned int stable,
int *out2,
int parent_r)
{
@@ -433,13 +435,13 @@ static int crush_choose_firstn(const struct crush_map *map,
int collide, reject;
int count = out_size;
- dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d\n",
+ dprintk("CHOOSE%s bucket %d x %d outpos %d numrep %d tries %d recurse_tries %d local_retries %d local_fallback_retries %d parent_r %d stable %d\n",
recurse_to_leaf ? "_LEAF" : "",
bucket->id, x, outpos, numrep,
tries, recurse_tries, local_retries, local_fallback_retries,
- parent_r);
+ parent_r, stable);
- for (rep = outpos; rep < numrep && count > 0 ; rep++) {
+ for (rep = stable ? 0 : outpos; rep < numrep && count > 0 ; rep++) {
/* keep trying until we get a non-out, non-colliding item */
ftotal = 0;
skip_rep = 0;
@@ -512,13 +514,14 @@ static int crush_choose_firstn(const struct crush_map *map,
if (crush_choose_firstn(map,
map->buckets[-1-item],
weight, weight_max,
- x, outpos+1, 0,
+ x, stable ? 1 : outpos+1, 0,
out2, outpos, count,
recurse_tries, 0,
local_retries,
local_fallback_retries,
0,
vary_r,
+ stable,
NULL,
sub_r) <= outpos)
/* didn't get leaf */
@@ -816,6 +819,7 @@ int crush_do_rule(const struct crush_map *map,
int choose_local_fallback_retries = map->choose_local_fallback_tries;
int vary_r = map->chooseleaf_vary_r;
+ int stable = map->chooseleaf_stable;
if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno);
@@ -835,7 +839,8 @@ int crush_do_rule(const struct crush_map *map,
case CRUSH_RULE_TAKE:
if ((curstep->arg1 >= 0 &&
curstep->arg1 < map->max_devices) ||
- (-1-curstep->arg1 < map->max_buckets &&
+ (-1-curstep->arg1 >= 0 &&
+ -1-curstep->arg1 < map->max_buckets &&
map->buckets[-1-curstep->arg1])) {
w[0] = curstep->arg1;
wsize = 1;
@@ -869,6 +874,11 @@ int crush_do_rule(const struct crush_map *map,
vary_r = curstep->arg1;
break;
+ case CRUSH_RULE_SET_CHOOSELEAF_STABLE:
+ if (curstep->arg1 >= 0)
+ stable = curstep->arg1;
+ break;
+
case CRUSH_RULE_CHOOSELEAF_FIRSTN:
case CRUSH_RULE_CHOOSE_FIRSTN:
firstn = 1;
@@ -888,6 +898,7 @@ int crush_do_rule(const struct crush_map *map,
osize = 0;
for (i = 0; i < wsize; i++) {
+ int bno;
/*
* see CRUSH_N, CRUSH_N_MINUS macros.
* basically, numrep <= 0 means relative to
@@ -900,6 +911,13 @@ int crush_do_rule(const struct crush_map *map,
continue;
}
j = 0;
+ /* make sure bucket id is valid */
+ bno = -1 - w[i];
+ if (bno < 0 || bno >= map->max_buckets) {
+ /* w[i] is probably CRUSH_ITEM_NONE */
+ dprintk(" bad w[i] %d\n", w[i]);
+ continue;
+ }
if (firstn) {
int recurse_tries;
if (choose_leaf_tries)
@@ -911,7 +929,7 @@ int crush_do_rule(const struct crush_map *map,
recurse_tries = choose_tries;
osize += crush_choose_firstn(
map,
- map->buckets[-1-w[i]],
+ map->buckets[bno],
weight, weight_max,
x, numrep,
curstep->arg2,
@@ -923,6 +941,7 @@ int crush_do_rule(const struct crush_map *map,
choose_local_fallback_retries,
recurse_to_leaf,
vary_r,
+ stable,
c+osize,
0);
} else {
@@ -930,7 +949,7 @@ int crush_do_rule(const struct crush_map *map,
numrep : (result_max-osize));
crush_choose_indep(
map,
- map->buckets[-1-w[i]],
+ map->buckets[bno],
weight, weight_max,
x, out_size, numrep,
curstep->arg2,
diff --git a/net/ceph/crypto.c b/net/ceph/crypto.c
index 42e8649c6e79..db2847ac5f12 100644
--- a/net/ceph/crypto.c
+++ b/net/ceph/crypto.c
@@ -4,7 +4,8 @@
#include <linux/err.h>
#include <linux/scatterlist.h>
#include <linux/slab.h>
-#include <crypto/hash.h>
+#include <crypto/aes.h>
+#include <crypto/skcipher.h>
#include <linux/key-type.h>
#include <keys/ceph-type.h>
@@ -79,9 +80,9 @@ int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
return 0;
}
-static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+static struct crypto_skcipher *ceph_crypto_alloc_cipher(void)
{
- return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+ return crypto_alloc_skcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
}
static const u8 *aes_iv = (u8 *)CEPH_AES_IV;
@@ -162,11 +163,10 @@ static int ceph_aes_encrypt(const void *key, int key_len,
{
struct scatterlist sg_in[2], prealloc_sg;
struct sg_table sg_out;
- struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
- struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- void *iv;
- int ivsize;
+ char iv[AES_BLOCK_SIZE];
size_t zero_padding = (0x10 - (src_len & 0x0f));
char pad[16];
@@ -184,10 +184,13 @@ static int ceph_aes_encrypt(const void *key, int key_len,
if (ret)
goto out_tfm;
- crypto_blkcipher_setkey((void *)tfm, key, key_len);
- iv = crypto_blkcipher_crt(tfm)->iv;
- ivsize = crypto_blkcipher_ivsize(tfm);
- memcpy(iv, aes_iv, ivsize);
+ crypto_skcipher_setkey((void *)tfm, key, key_len);
+ memcpy(iv, aes_iv, AES_BLOCK_SIZE);
+
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
+ src_len + zero_padding, iv);
/*
print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
@@ -197,8 +200,8 @@ static int ceph_aes_encrypt(const void *key, int key_len,
print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
pad, zero_padding, 1);
*/
- ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in,
- src_len + zero_padding);
+ ret = crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
if (ret < 0) {
pr_err("ceph_aes_crypt failed %d\n", ret);
goto out_sg;
@@ -211,7 +214,7 @@ static int ceph_aes_encrypt(const void *key, int key_len,
out_sg:
teardown_sgtable(&sg_out);
out_tfm:
- crypto_free_blkcipher(tfm);
+ crypto_free_skcipher(tfm);
return ret;
}
@@ -222,11 +225,10 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
{
struct scatterlist sg_in[3], prealloc_sg;
struct sg_table sg_out;
- struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
- struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
int ret;
- void *iv;
- int ivsize;
+ char iv[AES_BLOCK_SIZE];
size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
char pad[16];
@@ -245,10 +247,13 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
if (ret)
goto out_tfm;
- crypto_blkcipher_setkey((void *)tfm, key, key_len);
- iv = crypto_blkcipher_crt(tfm)->iv;
- ivsize = crypto_blkcipher_ivsize(tfm);
- memcpy(iv, aes_iv, ivsize);
+ crypto_skcipher_setkey((void *)tfm, key, key_len);
+ memcpy(iv, aes_iv, AES_BLOCK_SIZE);
+
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg_in, sg_out.sgl,
+ src1_len + src2_len + zero_padding, iv);
/*
print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
@@ -260,8 +265,8 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
pad, zero_padding, 1);
*/
- ret = crypto_blkcipher_encrypt(&desc, sg_out.sgl, sg_in,
- src1_len + src2_len + zero_padding);
+ ret = crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
if (ret < 0) {
pr_err("ceph_aes_crypt2 failed %d\n", ret);
goto out_sg;
@@ -274,7 +279,7 @@ static int ceph_aes_encrypt2(const void *key, int key_len, void *dst,
out_sg:
teardown_sgtable(&sg_out);
out_tfm:
- crypto_free_blkcipher(tfm);
+ crypto_free_skcipher(tfm);
return ret;
}
@@ -284,11 +289,10 @@ static int ceph_aes_decrypt(const void *key, int key_len,
{
struct sg_table sg_in;
struct scatterlist sg_out[2], prealloc_sg;
- struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
- struct blkcipher_desc desc = { .tfm = tfm };
+ struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
char pad[16];
- void *iv;
- int ivsize;
+ char iv[AES_BLOCK_SIZE];
int ret;
int last_byte;
@@ -302,10 +306,13 @@ static int ceph_aes_decrypt(const void *key, int key_len,
if (ret)
goto out_tfm;
- crypto_blkcipher_setkey((void *)tfm, key, key_len);
- iv = crypto_blkcipher_crt(tfm)->iv;
- ivsize = crypto_blkcipher_ivsize(tfm);
- memcpy(iv, aes_iv, ivsize);
+ crypto_skcipher_setkey((void *)tfm, key, key_len);
+ memcpy(iv, aes_iv, AES_BLOCK_SIZE);
+
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
+ src_len, iv);
/*
print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
@@ -313,7 +320,8 @@ static int ceph_aes_decrypt(const void *key, int key_len,
print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
src, src_len, 1);
*/
- ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len);
+ ret = crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
if (ret < 0) {
pr_err("ceph_aes_decrypt failed %d\n", ret);
goto out_sg;
@@ -338,7 +346,7 @@ static int ceph_aes_decrypt(const void *key, int key_len,
out_sg:
teardown_sgtable(&sg_in);
out_tfm:
- crypto_free_blkcipher(tfm);
+ crypto_free_skcipher(tfm);
return ret;
}
@@ -349,11 +357,10 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
{
struct sg_table sg_in;
struct scatterlist sg_out[3], prealloc_sg;
- struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
- struct blkcipher_desc desc = { .tfm = tfm };
+ struct crypto_skcipher *tfm = ceph_crypto_alloc_cipher();
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
char pad[16];
- void *iv;
- int ivsize;
+ char iv[AES_BLOCK_SIZE];
int ret;
int last_byte;
@@ -368,10 +375,13 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
if (ret)
goto out_tfm;
- crypto_blkcipher_setkey((void *)tfm, key, key_len);
- iv = crypto_blkcipher_crt(tfm)->iv;
- ivsize = crypto_blkcipher_ivsize(tfm);
- memcpy(iv, aes_iv, ivsize);
+ crypto_skcipher_setkey((void *)tfm, key, key_len);
+ memcpy(iv, aes_iv, AES_BLOCK_SIZE);
+
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg_in.sgl, sg_out,
+ src_len, iv);
/*
print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
@@ -379,7 +389,8 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
src, src_len, 1);
*/
- ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in.sgl, src_len);
+ ret = crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
if (ret < 0) {
pr_err("ceph_aes_decrypt failed %d\n", ret);
goto out_sg;
@@ -415,7 +426,7 @@ static int ceph_aes_decrypt2(const void *key, int key_len,
out_sg:
teardown_sgtable(&sg_in);
out_tfm:
- crypto_free_blkcipher(tfm);
+ crypto_free_skcipher(tfm);
return ret;
}
diff --git a/net/ceph/crypto.h b/net/ceph/crypto.h
index d1498224c49d..2e9cab09f37b 100644
--- a/net/ceph/crypto.h
+++ b/net/ceph/crypto.h
@@ -16,8 +16,10 @@ struct ceph_crypto_key {
static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
{
- if (key)
+ if (key) {
kfree(key->key);
+ key->key = NULL;
+ }
}
int ceph_crypto_key_clone(struct ceph_crypto_key *dst,
diff --git a/net/ceph/debugfs.c b/net/ceph/debugfs.c
index 593dc2eabcc8..b902fbc7863e 100644
--- a/net/ceph/debugfs.c
+++ b/net/ceph/debugfs.c
@@ -112,15 +112,20 @@ static int monc_show(struct seq_file *s, void *p)
struct ceph_mon_generic_request *req;
struct ceph_mon_client *monc = &client->monc;
struct rb_node *rp;
+ int i;
mutex_lock(&monc->mutex);
- if (monc->have_mdsmap)
- seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap);
- if (monc->have_osdmap)
- seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap);
- if (monc->want_next_osdmap)
- seq_printf(s, "want next osdmap\n");
+ for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+ seq_printf(s, "have %s %u", ceph_sub_str[i],
+ monc->subs[i].have);
+ if (monc->subs[i].want)
+ seq_printf(s, " want %llu%s",
+ le64_to_cpu(monc->subs[i].item.start),
+ (monc->subs[i].item.flags &
+ CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
+ seq_putc(s, '\n');
+ }
for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
__u16 op;
diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
index b9b0e3b5da49..a5502898ea33 100644
--- a/net/ceph/messenger.c
+++ b/net/ceph/messenger.c
@@ -23,9 +23,6 @@
#include <linux/ceph/pagelist.h>
#include <linux/export.h>
-#define list_entry_next(pos, member) \
- list_entry(pos->member.next, typeof(*pos), member)
-
/*
* Ceph uses the messenger to exchange ceph_msg messages with other
* hosts in the system. The messenger provides ordered and reliable
@@ -238,18 +235,12 @@ static struct workqueue_struct *ceph_msgr_wq;
static int ceph_msgr_slab_init(void)
{
BUG_ON(ceph_msg_cache);
- ceph_msg_cache = kmem_cache_create("ceph_msg",
- sizeof (struct ceph_msg),
- __alignof__(struct ceph_msg), 0, NULL);
-
+ ceph_msg_cache = KMEM_CACHE(ceph_msg, 0);
if (!ceph_msg_cache)
return -ENOMEM;
BUG_ON(ceph_msg_data_cache);
- ceph_msg_data_cache = kmem_cache_create("ceph_msg_data",
- sizeof (struct ceph_msg_data),
- __alignof__(struct ceph_msg_data),
- 0, NULL);
+ ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0);
if (ceph_msg_data_cache)
return 0;
@@ -278,7 +269,7 @@ static void _ceph_msgr_exit(void)
}
BUG_ON(zero_page == NULL);
- page_cache_release(zero_page);
+ put_page(zero_page);
zero_page = NULL;
ceph_msgr_slab_exit();
@@ -291,7 +282,7 @@ int ceph_msgr_init(void)
BUG_ON(zero_page != NULL);
zero_page = ZERO_PAGE(0);
- page_cache_get(zero_page);
+ get_page(zero_page);
/*
* The number of active work items is limited by the number of
@@ -509,7 +500,7 @@ static int ceph_tcp_connect(struct ceph_connection *con)
return ret;
}
- if (con->msgr->tcp_nodelay) {
+ if (ceph_test_opt(from_msgr(con->msgr), TCP_NODELAY)) {
int optval = 1;
ret = kernel_setsockopt(sock, SOL_TCP, TCP_NODELAY,
@@ -637,9 +628,6 @@ static int con_close_socket(struct ceph_connection *con)
static void ceph_msg_remove(struct ceph_msg *msg)
{
list_del_init(&msg->list_head);
- BUG_ON(msg->con == NULL);
- msg->con->ops->put(msg->con);
- msg->con = NULL;
ceph_msg_put(msg);
}
@@ -662,20 +650,21 @@ static void reset_connection(struct ceph_connection *con)
if (con->in_msg) {
BUG_ON(con->in_msg->con != con);
- con->in_msg->con = NULL;
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
- con->ops->put(con);
}
con->connect_seq = 0;
con->out_seq = 0;
if (con->out_msg) {
+ BUG_ON(con->out_msg->con != con);
ceph_msg_put(con->out_msg);
con->out_msg = NULL;
}
con->in_seq = 0;
con->in_seq_acked = 0;
+
+ con->out_skip = 0;
}
/*
@@ -775,6 +764,8 @@ static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
static void con_out_kvec_reset(struct ceph_connection *con)
{
+ BUG_ON(con->out_skip);
+
con->out_kvec_left = 0;
con->out_kvec_bytes = 0;
con->out_kvec_cur = &con->out_kvec[0];
@@ -783,9 +774,9 @@ static void con_out_kvec_reset(struct ceph_connection *con)
static void con_out_kvec_add(struct ceph_connection *con,
size_t size, void *data)
{
- int index;
+ int index = con->out_kvec_left;
- index = con->out_kvec_left;
+ BUG_ON(con->out_skip);
BUG_ON(index >= ARRAY_SIZE(con->out_kvec));
con->out_kvec[index].iov_len = size;
@@ -794,6 +785,27 @@ static void con_out_kvec_add(struct ceph_connection *con,
con->out_kvec_bytes += size;
}
+/*
+ * Chop off a kvec from the end. Return residual number of bytes for
+ * that kvec, i.e. how many bytes would have been written if the kvec
+ * hadn't been nuked.
+ */
+static int con_out_kvec_skip(struct ceph_connection *con)
+{
+ int off = con->out_kvec_cur - con->out_kvec;
+ int skip = 0;
+
+ if (con->out_kvec_bytes > 0) {
+ skip = con->out_kvec[off + con->out_kvec_left - 1].iov_len;
+ BUG_ON(con->out_kvec_bytes < skip);
+ BUG_ON(!con->out_kvec_left);
+ con->out_kvec_bytes -= skip;
+ con->out_kvec_left--;
+ }
+
+ return skip;
+}
+
#ifdef CONFIG_BLOCK
/*
@@ -1046,7 +1058,7 @@ static bool ceph_msg_data_pagelist_advance(struct ceph_msg_data_cursor *cursor,
/* Move on to the next page */
BUG_ON(list_is_last(&cursor->page->lru, &pagelist->head));
- cursor->page = list_entry_next(cursor->page, lru);
+ cursor->page = list_next_entry(cursor->page, lru);
cursor->last_piece = cursor->resid <= PAGE_SIZE;
return true;
@@ -1170,7 +1182,7 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
if (!cursor->resid && cursor->total_resid) {
WARN_ON(!cursor->last_piece);
BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
- cursor->data = list_entry_next(cursor->data, links);
+ cursor->data = list_next_entry(cursor->data, links);
__ceph_msg_data_cursor_init(cursor);
new_piece = true;
}
@@ -1179,6 +1191,13 @@ static bool ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
return new_piece;
}
+static size_t sizeof_footer(struct ceph_connection *con)
+{
+ return (con->peer_features & CEPH_FEATURE_MSG_AUTH) ?
+ sizeof(struct ceph_msg_footer) :
+ sizeof(struct ceph_msg_footer_old);
+}
+
static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
{
BUG_ON(!msg);
@@ -1196,26 +1215,19 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
static void prepare_write_message_footer(struct ceph_connection *con)
{
struct ceph_msg *m = con->out_msg;
- int v = con->out_kvec_left;
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
dout("prepare_write_message_footer %p\n", con);
- con->out_kvec_is_msg = true;
- con->out_kvec[v].iov_base = &m->footer;
+ con_out_kvec_add(con, sizeof_footer(con), &m->footer);
if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
if (con->ops->sign_message)
- con->ops->sign_message(con, m);
+ con->ops->sign_message(m);
else
m->footer.sig = 0;
- con->out_kvec[v].iov_len = sizeof(m->footer);
- con->out_kvec_bytes += sizeof(m->footer);
} else {
m->old_footer.flags = m->footer.flags;
- con->out_kvec[v].iov_len = sizeof(m->old_footer);
- con->out_kvec_bytes += sizeof(m->old_footer);
}
- con->out_kvec_left++;
con->out_more = m->more_to_follow;
con->out_msg_done = true;
}
@@ -1229,7 +1241,6 @@ static void prepare_write_message(struct ceph_connection *con)
u32 crc;
con_out_kvec_reset(con);
- con->out_kvec_is_msg = true;
con->out_msg_done = false;
/* Sneak an ack in there first? If we can get it into the same
@@ -1269,18 +1280,19 @@ static void prepare_write_message(struct ceph_connection *con)
/* tag + hdr + front + middle */
con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
- con_out_kvec_add(con, sizeof (m->hdr), &m->hdr);
+ con_out_kvec_add(con, sizeof(con->out_hdr), &con->out_hdr);
con_out_kvec_add(con, m->front.iov_len, m->front.iov_base);
if (m->middle)
con_out_kvec_add(con, m->middle->vec.iov_len,
m->middle->vec.iov_base);
- /* fill in crc (except data pages), footer */
+ /* fill in hdr crc and finalize hdr */
crc = crc32c(0, &m->hdr, offsetof(struct ceph_msg_header, crc));
con->out_msg->hdr.crc = cpu_to_le32(crc);
- con->out_msg->footer.flags = 0;
+ memcpy(&con->out_hdr, &con->out_msg->hdr, sizeof(con->out_hdr));
+ /* fill in front and middle crc, footer */
crc = crc32c(0, m->front.iov_base, m->front.iov_len);
con->out_msg->footer.front_crc = cpu_to_le32(crc);
if (m->middle) {
@@ -1292,6 +1304,7 @@ static void prepare_write_message(struct ceph_connection *con)
dout("%s front_crc %u middle_crc %u\n", __func__,
le32_to_cpu(con->out_msg->footer.front_crc),
le32_to_cpu(con->out_msg->footer.middle_crc));
+ con->out_msg->footer.flags = 0;
/* is there a data payload? */
con->out_msg->footer.data_crc = 0;
@@ -1432,7 +1445,8 @@ static int prepare_write_connect(struct ceph_connection *con)
dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
con->connect_seq, global_seq, proto);
- con->out_connect.features = cpu_to_le64(con->msgr->supported_features);
+ con->out_connect.features =
+ cpu_to_le64(from_msgr(con->msgr)->supported_features);
con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
con->out_connect.global_seq = cpu_to_le32(global_seq);
@@ -1495,7 +1509,6 @@ static int write_partial_kvec(struct ceph_connection *con)
}
}
con->out_kvec_left = 0;
- con->out_kvec_is_msg = false;
ret = 1;
out:
dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
@@ -1527,7 +1540,7 @@ static int write_partial_message_data(struct ceph_connection *con)
{
struct ceph_msg *msg = con->out_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
- bool do_datacrc = !con->msgr->nocrc;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
u32 crc;
dout("%s %p msg %p\n", __func__, con, msg);
@@ -1552,8 +1565,8 @@ static int write_partial_message_data(struct ceph_connection *con)
bool need_crc;
int ret;
- page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
- &last_piece);
+ page = ceph_msg_data_next(cursor, &page_offset, &length,
+ &last_piece);
ret = ceph_tcp_sendpage(con->sock, page, page_offset,
length, !last_piece);
if (ret <= 0) {
@@ -1564,7 +1577,7 @@ static int write_partial_message_data(struct ceph_connection *con)
}
if (do_datacrc && cursor->need_crc)
crc = ceph_crc32c_page(crc, page, page_offset, length);
- need_crc = ceph_msg_data_advance(&msg->cursor, (size_t)ret);
+ need_crc = ceph_msg_data_advance(cursor, (size_t)ret);
}
dout("%s %p msg %p done\n", __func__, con, msg);
@@ -1587,8 +1600,9 @@ static int write_partial_skip(struct ceph_connection *con)
{
int ret;
+ dout("%s %p %d left\n", __func__, con, con->out_skip);
while (con->out_skip > 0) {
- size_t size = min(con->out_skip, (int) PAGE_CACHE_SIZE);
+ size_t size = min(con->out_skip, (int) PAGE_SIZE);
ret = ceph_tcp_sendpage(con->sock, zero_page, 0, size, true);
if (ret <= 0)
@@ -2005,8 +2019,8 @@ static int process_banner(struct ceph_connection *con)
static int process_connect(struct ceph_connection *con)
{
- u64 sup_feat = con->msgr->supported_features;
- u64 req_feat = con->msgr->required_features;
+ u64 sup_feat = from_msgr(con->msgr)->supported_features;
+ u64 req_feat = from_msgr(con->msgr)->required_features;
u64 server_feat = ceph_sanitize_features(
le64_to_cpu(con->in_reply.features));
int ret;
@@ -2232,7 +2246,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
{
struct ceph_msg *msg = con->in_msg;
struct ceph_msg_data_cursor *cursor = &msg->cursor;
- const bool do_datacrc = !con->msgr->nocrc;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
struct page *page;
size_t page_offset;
size_t length;
@@ -2246,8 +2260,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
if (do_datacrc)
crc = con->in_data_crc;
while (cursor->resid) {
- page = ceph_msg_data_next(&msg->cursor, &page_offset, &length,
- NULL);
+ page = ceph_msg_data_next(cursor, &page_offset, &length, NULL);
ret = ceph_tcp_recvpage(con->sock, page, page_offset, length);
if (ret <= 0) {
if (do_datacrc)
@@ -2258,7 +2271,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
if (do_datacrc)
crc = ceph_crc32c_page(crc, page, page_offset, ret);
- (void) ceph_msg_data_advance(&msg->cursor, (size_t)ret);
+ (void) ceph_msg_data_advance(cursor, (size_t)ret);
}
if (do_datacrc)
con->in_data_crc = crc;
@@ -2278,7 +2291,7 @@ static int read_partial_message(struct ceph_connection *con)
int end;
int ret;
unsigned int front_len, middle_len, data_len;
- bool do_datacrc = !con->msgr->nocrc;
+ bool do_datacrc = !ceph_test_opt(from_msgr(con->msgr), NOCRC);
bool need_sign = (con->peer_features & CEPH_FEATURE_MSG_AUTH);
u64 seq;
u32 crc;
@@ -2317,9 +2330,9 @@ static int read_partial_message(struct ceph_connection *con)
ceph_pr_addr(&con->peer_addr.in_addr),
seq, con->in_seq + 1);
con->in_base_pos = -front_len - middle_len - data_len -
- sizeof(m->footer);
+ sizeof_footer(con);
con->in_tag = CEPH_MSGR_TAG_READY;
- return 0;
+ return 1;
} else if ((s64)seq - (s64)con->in_seq > 1) {
pr_err("read_partial_message bad seq %lld expected %lld\n",
seq, con->in_seq + 1);
@@ -2342,10 +2355,10 @@ static int read_partial_message(struct ceph_connection *con)
/* skip this message */
dout("alloc_msg said skip message\n");
con->in_base_pos = -front_len - middle_len - data_len -
- sizeof(m->footer);
+ sizeof_footer(con);
con->in_tag = CEPH_MSGR_TAG_READY;
con->in_seq++;
- return 0;
+ return 1;
}
BUG_ON(!con->in_msg);
@@ -2384,11 +2397,7 @@ static int read_partial_message(struct ceph_connection *con)
}
/* footer */
- if (need_sign)
- size = sizeof(m->footer);
- else
- size = sizeof(m->old_footer);
-
+ size = sizeof_footer(con);
end += size;
ret = read_partial(con, end, size, &m->footer);
if (ret <= 0)
@@ -2423,7 +2432,7 @@ static int read_partial_message(struct ceph_connection *con)
}
if (need_sign && con->ops->check_message_signature &&
- con->ops->check_message_signature(con, m)) {
+ con->ops->check_message_signature(m)) {
pr_err("read_partial_message %p signature check failed\n", m);
return -EBADMSG;
}
@@ -2438,13 +2447,10 @@ static int read_partial_message(struct ceph_connection *con)
*/
static void process_message(struct ceph_connection *con)
{
- struct ceph_msg *msg;
+ struct ceph_msg *msg = con->in_msg;
BUG_ON(con->in_msg->con != con);
- con->in_msg->con = NULL;
- msg = con->in_msg;
con->in_msg = NULL;
- con->ops->put(con);
/* if first message, set peer_name */
if (con->peer_name.type == 0)
@@ -2513,13 +2519,13 @@ more:
more_kvec:
/* kvec data queued? */
- if (con->out_skip) {
- ret = write_partial_skip(con);
+ if (con->out_kvec_left) {
+ ret = write_partial_kvec(con);
if (ret <= 0)
goto out;
}
- if (con->out_kvec_left) {
- ret = write_partial_kvec(con);
+ if (con->out_skip) {
+ ret = write_partial_skip(con);
if (ret <= 0)
goto out;
}
@@ -2677,7 +2683,7 @@ more:
if (ret <= 0) {
switch (ret) {
case -EBADMSG:
- con->error_msg = "bad crc";
+ con->error_msg = "bad crc/signature";
/* fall through */
case -EBADE:
ret = -EIO;
@@ -2812,13 +2818,17 @@ static bool con_backoff(struct ceph_connection *con)
static void con_fault_finish(struct ceph_connection *con)
{
+ dout("%s %p\n", __func__, con);
+
/*
* in case we faulted due to authentication, invalidate our
* current tickets so that we can get new ones.
*/
- if (con->auth_retry && con->ops->invalidate_authorizer) {
- dout("calling invalidate_authorizer()\n");
- con->ops->invalidate_authorizer(con);
+ if (con->auth_retry) {
+ dout("auth_retry %d, invalidating\n", con->auth_retry);
+ if (con->ops->invalidate_authorizer)
+ con->ops->invalidate_authorizer(con);
+ con->auth_retry = 0;
}
if (con->ops->fault)
@@ -2918,10 +2928,8 @@ static void con_fault(struct ceph_connection *con)
if (con->in_msg) {
BUG_ON(con->in_msg->con != con);
- con->in_msg->con = NULL;
ceph_msg_put(con->in_msg);
con->in_msg = NULL;
- con->ops->put(con);
}
/* Requeue anything that hasn't been acked */
@@ -2952,15 +2960,8 @@ static void con_fault(struct ceph_connection *con)
* initialize a new messenger instance
*/
void ceph_messenger_init(struct ceph_messenger *msgr,
- struct ceph_entity_addr *myaddr,
- u64 supported_features,
- u64 required_features,
- bool nocrc,
- bool tcp_nodelay)
+ struct ceph_entity_addr *myaddr)
{
- msgr->supported_features = supported_features;
- msgr->required_features = required_features;
-
spin_lock_init(&msgr->global_seq_lock);
if (myaddr)
@@ -2970,8 +2971,6 @@ void ceph_messenger_init(struct ceph_messenger *msgr,
msgr->inst.addr.type = 0;
get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
encode_my_addr(msgr);
- msgr->nocrc = nocrc;
- msgr->tcp_nodelay = tcp_nodelay;
atomic_set(&msgr->stopping, 0);
write_pnet(&msgr->net, get_net(current->nsproxy->net_ns));
@@ -2986,6 +2985,15 @@ void ceph_messenger_fini(struct ceph_messenger *msgr)
}
EXPORT_SYMBOL(ceph_messenger_fini);
+static void msg_con_set(struct ceph_msg *msg, struct ceph_connection *con)
+{
+ if (msg->con)
+ msg->con->ops->put(msg->con);
+
+ msg->con = con ? con->ops->get(con) : NULL;
+ BUG_ON(msg->con != con);
+}
+
static void clear_standby(struct ceph_connection *con)
{
/* come back from STANDBY? */
@@ -3017,9 +3025,7 @@ void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
return;
}
- BUG_ON(msg->con != NULL);
- msg->con = con->ops->get(con);
- BUG_ON(msg->con == NULL);
+ msg_con_set(msg, con);
BUG_ON(!list_empty(&msg->list_head));
list_add_tail(&msg->list_head, &con->out_queue);
@@ -3047,31 +3053,42 @@ void ceph_msg_revoke(struct ceph_msg *msg)
{
struct ceph_connection *con = msg->con;
- if (!con)
+ if (!con) {
+ dout("%s msg %p null con\n", __func__, msg);
return; /* Message not in our possession */
+ }
mutex_lock(&con->mutex);
if (!list_empty(&msg->list_head)) {
dout("%s %p msg %p - was on queue\n", __func__, con, msg);
list_del_init(&msg->list_head);
- BUG_ON(msg->con == NULL);
- msg->con->ops->put(msg->con);
- msg->con = NULL;
msg->hdr.seq = 0;
ceph_msg_put(msg);
}
if (con->out_msg == msg) {
- dout("%s %p msg %p - was sending\n", __func__, con, msg);
- con->out_msg = NULL;
- if (con->out_kvec_is_msg) {
- con->out_skip = con->out_kvec_bytes;
- con->out_kvec_is_msg = false;
+ BUG_ON(con->out_skip);
+ /* footer */
+ if (con->out_msg_done) {
+ con->out_skip += con_out_kvec_skip(con);
+ } else {
+ BUG_ON(!msg->data_length);
+ con->out_skip += sizeof_footer(con);
}
+ /* data, middle, front */
+ if (msg->data_length)
+ con->out_skip += msg->cursor.total_resid;
+ if (msg->middle)
+ con->out_skip += con_out_kvec_skip(con);
+ con->out_skip += con_out_kvec_skip(con);
+
+ dout("%s %p msg %p - was sending, will write %d skip %d\n",
+ __func__, con, msg, con->out_kvec_bytes, con->out_skip);
msg->hdr.seq = 0;
-
+ con->out_msg = NULL;
ceph_msg_put(msg);
}
+
mutex_unlock(&con->mutex);
}
@@ -3080,16 +3097,13 @@ void ceph_msg_revoke(struct ceph_msg *msg)
*/
void ceph_msg_revoke_incoming(struct ceph_msg *msg)
{
- struct ceph_connection *con;
+ struct ceph_connection *con = msg->con;
- BUG_ON(msg == NULL);
- if (!msg->con) {
+ if (!con) {
dout("%s msg %p null con\n", __func__, msg);
-
return; /* Message not in our possession */
}
- con = msg->con;
mutex_lock(&con->mutex);
if (con->in_msg == msg) {
unsigned int front_len = le32_to_cpu(con->in_hdr.front_len);
@@ -3335,9 +3349,8 @@ static int ceph_con_in_msg_alloc(struct ceph_connection *con, int *skip)
}
if (msg) {
BUG_ON(*skip);
+ msg_con_set(msg, con);
con->in_msg = msg;
- con->in_msg->con = con->ops->get(con);
- BUG_ON(con->in_msg->con == NULL);
} else {
/*
* Null message pointer means either we should skip
@@ -3377,25 +3390,21 @@ static void ceph_msg_free(struct ceph_msg *m)
static void ceph_msg_release(struct kref *kref)
{
struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
- LIST_HEAD(data);
- struct list_head *links;
- struct list_head *next;
+ struct ceph_msg_data *data, *next;
dout("%s %p\n", __func__, m);
WARN_ON(!list_empty(&m->list_head));
+ msg_con_set(m, NULL);
+
/* drop middle, data, if any */
if (m->middle) {
ceph_buffer_put(m->middle);
m->middle = NULL;
}
- list_splice_init(&m->data, &data);
- list_for_each_safe(links, next, &data) {
- struct ceph_msg_data *data;
-
- data = list_entry(links, struct ceph_msg_data, links);
- list_del_init(links);
+ list_for_each_entry_safe(data, next, &m->data, links) {
+ list_del_init(&data->links);
ceph_msg_data_destroy(data);
}
m->data_length = 0;
diff --git a/net/ceph/mon_client.c b/net/ceph/mon_client.c
index edda01626a45..cf638c009cfa 100644
--- a/net/ceph/mon_client.c
+++ b/net/ceph/mon_client.c
@@ -122,51 +122,91 @@ static void __close_session(struct ceph_mon_client *monc)
ceph_msg_revoke(monc->m_subscribe);
ceph_msg_revoke_incoming(monc->m_subscribe_ack);
ceph_con_close(&monc->con);
- monc->cur_mon = -1;
+
monc->pending_auth = 0;
ceph_auth_reset(monc->auth);
}
/*
- * Open a session with a (new) monitor.
+ * Pick a new monitor at random and set cur_mon. If we are repicking
+ * (i.e. cur_mon is already set), be sure to pick a different one.
*/
-static int __open_session(struct ceph_mon_client *monc)
+static void pick_new_mon(struct ceph_mon_client *monc)
{
- char r;
- int ret;
+ int old_mon = monc->cur_mon;
- if (monc->cur_mon < 0) {
- get_random_bytes(&r, 1);
- monc->cur_mon = r % monc->monmap->num_mon;
- dout("open_session num=%d r=%d -> mon%d\n",
- monc->monmap->num_mon, r, monc->cur_mon);
- monc->sub_sent = 0;
- monc->sub_renew_after = jiffies; /* i.e., expired */
- monc->want_next_osdmap = !!monc->want_next_osdmap;
-
- dout("open_session mon%d opening\n", monc->cur_mon);
- ceph_con_open(&monc->con,
- CEPH_ENTITY_TYPE_MON, monc->cur_mon,
- &monc->monmap->mon_inst[monc->cur_mon].addr);
-
- /* send an initial keepalive to ensure our timestamp is
- * valid by the time we are in an OPENED state */
- ceph_con_keepalive(&monc->con);
-
- /* initiatiate authentication handshake */
- ret = ceph_auth_build_hello(monc->auth,
- monc->m_auth->front.iov_base,
- monc->m_auth->front_alloc_len);
- __send_prepared_auth_request(monc, ret);
+ BUG_ON(monc->monmap->num_mon < 1);
+
+ if (monc->monmap->num_mon == 1) {
+ monc->cur_mon = 0;
} else {
- dout("open_session mon%d already open\n", monc->cur_mon);
+ int max = monc->monmap->num_mon;
+ int o = -1;
+ int n;
+
+ if (monc->cur_mon >= 0) {
+ if (monc->cur_mon < monc->monmap->num_mon)
+ o = monc->cur_mon;
+ if (o >= 0)
+ max--;
+ }
+
+ n = prandom_u32() % max;
+ if (o >= 0 && n >= o)
+ n++;
+
+ monc->cur_mon = n;
}
- return 0;
+
+ dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon,
+ monc->cur_mon, monc->monmap->num_mon);
}
-static bool __sub_expired(struct ceph_mon_client *monc)
+/*
+ * Open a session with a new monitor.
+ */
+static void __open_session(struct ceph_mon_client *monc)
{
- return time_after_eq(jiffies, monc->sub_renew_after);
+ int ret;
+
+ pick_new_mon(monc);
+
+ monc->hunting = true;
+ if (monc->had_a_connection) {
+ monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF;
+ if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT)
+ monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT;
+ }
+
+ monc->sub_renew_after = jiffies; /* i.e., expired */
+ monc->sub_renew_sent = 0;
+
+ dout("%s opening mon%d\n", __func__, monc->cur_mon);
+ ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon,
+ &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+ /*
+ * send an initial keepalive to ensure our timestamp is valid
+ * by the time we are in an OPENED state
+ */
+ ceph_con_keepalive(&monc->con);
+
+ /* initiate authentication handshake */
+ ret = ceph_auth_build_hello(monc->auth,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_alloc_len);
+ BUG_ON(ret <= 0);
+ __send_prepared_auth_request(monc, ret);
+}
+
+static void reopen_session(struct ceph_mon_client *monc)
+{
+ if (!monc->hunting)
+ pr_info("mon%d %s session lost, hunting for new mon\n",
+ monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr));
+
+ __close_session(monc);
+ __open_session(monc);
}
/*
@@ -174,74 +214,70 @@ static bool __sub_expired(struct ceph_mon_client *monc)
*/
static void __schedule_delayed(struct ceph_mon_client *monc)
{
- struct ceph_options *opt = monc->client->options;
unsigned long delay;
- if (monc->cur_mon < 0 || __sub_expired(monc)) {
- delay = 10 * HZ;
- } else {
- delay = 20 * HZ;
- if (opt->monc_ping_timeout > 0)
- delay = min(delay, opt->monc_ping_timeout / 3);
- }
+ if (monc->hunting)
+ delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult;
+ else
+ delay = CEPH_MONC_PING_INTERVAL;
+
dout("__schedule_delayed after %lu\n", delay);
- schedule_delayed_work(&monc->delayed_work,
- round_jiffies_relative(delay));
+ mod_delayed_work(system_wq, &monc->delayed_work,
+ round_jiffies_relative(delay));
}
+const char *ceph_sub_str[] = {
+ [CEPH_SUB_MDSMAP] = "mdsmap",
+ [CEPH_SUB_MONMAP] = "monmap",
+ [CEPH_SUB_OSDMAP] = "osdmap",
+};
+
/*
- * Send subscribe request for mdsmap and/or osdmap.
+ * Send subscribe request for one or more maps, according to
+ * monc->subs.
*/
static void __send_subscribe(struct ceph_mon_client *monc)
{
- dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
- (unsigned int)monc->sub_sent, __sub_expired(monc),
- monc->want_next_osdmap);
- if ((__sub_expired(monc) && !monc->sub_sent) ||
- monc->want_next_osdmap == 1) {
- struct ceph_msg *msg = monc->m_subscribe;
- struct ceph_mon_subscribe_item *i;
- void *p, *end;
- int num;
-
- p = msg->front.iov_base;
- end = p + msg->front_alloc_len;
-
- num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap;
- ceph_encode_32(&p, num);
-
- if (monc->want_next_osdmap) {
- dout("__send_subscribe to 'osdmap' %u\n",
- (unsigned int)monc->have_osdmap);
- ceph_encode_string(&p, end, "osdmap", 6);
- i = p;
- i->have = cpu_to_le64(monc->have_osdmap);
- i->onetime = 1;
- p += sizeof(*i);
- monc->want_next_osdmap = 2; /* requested */
- }
- if (monc->want_mdsmap) {
- dout("__send_subscribe to 'mdsmap' %u+\n",
- (unsigned int)monc->have_mdsmap);
- ceph_encode_string(&p, end, "mdsmap", 6);
- i = p;
- i->have = cpu_to_le64(monc->have_mdsmap);
- i->onetime = 0;
- p += sizeof(*i);
- }
- ceph_encode_string(&p, end, "monmap", 6);
- i = p;
- i->have = 0;
- i->onetime = 0;
- p += sizeof(*i);
-
- msg->front.iov_len = p - msg->front.iov_base;
- msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
- ceph_msg_revoke(msg);
- ceph_con_send(&monc->con, ceph_msg_get(msg));
-
- monc->sub_sent = jiffies | 1; /* never 0 */
+ struct ceph_msg *msg = monc->m_subscribe;
+ void *p = msg->front.iov_base;
+ void *const end = p + msg->front_alloc_len;
+ int num = 0;
+ int i;
+
+ dout("%s sent %lu\n", __func__, monc->sub_renew_sent);
+
+ BUG_ON(monc->cur_mon < 0);
+
+ if (!monc->sub_renew_sent)
+ monc->sub_renew_sent = jiffies | 1; /* never 0 */
+
+ msg->hdr.version = cpu_to_le16(2);
+
+ for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+ if (monc->subs[i].want)
+ num++;
+ }
+ BUG_ON(num < 1); /* monmap sub is always there */
+ ceph_encode_32(&p, num);
+ for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
+ const char *s = ceph_sub_str[i];
+
+ if (!monc->subs[i].want)
+ continue;
+
+ dout("%s %s start %llu flags 0x%x\n", __func__, s,
+ le64_to_cpu(monc->subs[i].item.start),
+ monc->subs[i].item.flags);
+ ceph_encode_string(&p, end, s, strlen(s));
+ memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
+ p += sizeof(monc->subs[i].item);
}
+
+ BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_msg_revoke(msg);
+ ceph_con_send(&monc->con, ceph_msg_get(msg));
}
static void handle_subscribe_ack(struct ceph_mon_client *monc,
@@ -255,15 +291,16 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
seconds = le32_to_cpu(h->duration);
mutex_lock(&monc->mutex);
- if (monc->hunting) {
- pr_info("mon%d %s session established\n",
- monc->cur_mon,
- ceph_pr_addr(&monc->con.peer_addr.in_addr));
- monc->hunting = false;
+ if (monc->sub_renew_sent) {
+ monc->sub_renew_after = monc->sub_renew_sent +
+ (seconds >> 1) * HZ - 1;
+ dout("%s sent %lu duration %d renew after %lu\n", __func__,
+ monc->sub_renew_sent, seconds, monc->sub_renew_after);
+ monc->sub_renew_sent = 0;
+ } else {
+ dout("%s sent %lu renew after %lu, ignoring\n", __func__,
+ monc->sub_renew_sent, monc->sub_renew_after);
}
- dout("handle_subscribe_ack after %d seconds\n", seconds);
- monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
- monc->sub_sent = 0;
mutex_unlock(&monc->mutex);
return;
bad:
@@ -272,36 +309,82 @@ bad:
}
/*
- * Keep track of which maps we have
+ * Register interest in a map
+ *
+ * @sub: one of CEPH_SUB_*
+ * @epoch: X for "every map since X", or 0 for "just the latest"
*/
-int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub,
+ u32 epoch, bool continuous)
+{
+ __le64 start = cpu_to_le64(epoch);
+ u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0;
+
+ dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub],
+ epoch, continuous);
+
+ if (monc->subs[sub].want &&
+ monc->subs[sub].item.start == start &&
+ monc->subs[sub].item.flags == flags)
+ return false;
+
+ monc->subs[sub].item.start = start;
+ monc->subs[sub].item.flags = flags;
+ monc->subs[sub].want = true;
+
+ return true;
+}
+
+bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
+ bool continuous)
{
+ bool need_request;
+
mutex_lock(&monc->mutex);
- monc->have_mdsmap = got;
+ need_request = __ceph_monc_want_map(monc, sub, epoch, continuous);
mutex_unlock(&monc->mutex);
- return 0;
+
+ return need_request;
+}
+EXPORT_SYMBOL(ceph_monc_want_map);
+
+/*
+ * Keep track of which maps we have
+ *
+ * @sub: one of CEPH_SUB_*
+ */
+static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub,
+ u32 epoch)
+{
+ dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch);
+
+ if (monc->subs[sub].want) {
+ if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME)
+ monc->subs[sub].want = false;
+ else
+ monc->subs[sub].item.start = cpu_to_le64(epoch + 1);
+ }
+
+ monc->subs[sub].have = epoch;
}
-EXPORT_SYMBOL(ceph_monc_got_mdsmap);
-int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
{
mutex_lock(&monc->mutex);
- monc->have_osdmap = got;
- monc->want_next_osdmap = 0;
+ __ceph_monc_got_map(monc, sub, epoch);
mutex_unlock(&monc->mutex);
- return 0;
}
+EXPORT_SYMBOL(ceph_monc_got_map);
/*
* Register interest in the next osdmap
*/
void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
{
- dout("request_next_osdmap have %u\n", monc->have_osdmap);
+ dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
mutex_lock(&monc->mutex);
- if (!monc->want_next_osdmap)
- monc->want_next_osdmap = 1;
- if (monc->want_next_osdmap < 2)
+ if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
+ monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
__send_subscribe(monc);
mutex_unlock(&monc->mutex);
}
@@ -320,15 +403,15 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
long ret;
mutex_lock(&monc->mutex);
- while (monc->have_osdmap < epoch) {
+ while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) {
mutex_unlock(&monc->mutex);
if (timeout && time_after_eq(jiffies, started + timeout))
return -ETIMEDOUT;
ret = wait_event_interruptible_timeout(monc->client->auth_wq,
- monc->have_osdmap >= epoch,
- ceph_timeout_jiffies(timeout));
+ monc->subs[CEPH_SUB_OSDMAP].have >= epoch,
+ ceph_timeout_jiffies(timeout));
if (ret < 0)
return ret;
@@ -341,11 +424,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
EXPORT_SYMBOL(ceph_monc_wait_osdmap);
/*
- *
+ * Open a session with a random monitor. Request monmap and osdmap,
+ * which are waited upon in __ceph_open_session().
*/
int ceph_monc_open_session(struct ceph_mon_client *monc)
{
mutex_lock(&monc->mutex);
+ __ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true);
+ __ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false);
__open_session(monc);
__schedule_delayed(monc);
mutex_unlock(&monc->mutex);
@@ -353,33 +439,15 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
}
EXPORT_SYMBOL(ceph_monc_open_session);
-/*
- * We require the fsid and global_id in order to initialize our
- * debugfs dir.
- */
-static bool have_debugfs_info(struct ceph_mon_client *monc)
-{
- dout("have_debugfs_info fsid %d globalid %lld\n",
- (int)monc->client->have_fsid, monc->auth->global_id);
- return monc->client->have_fsid && monc->auth->global_id > 0;
-}
-
-/*
- * The monitor responds with mount ack indicate mount success. The
- * included client ticket allows the client to talk to MDSs and OSDs.
- */
static void ceph_monc_handle_map(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
struct ceph_client *client = monc->client;
struct ceph_monmap *monmap = NULL, *old = monc->monmap;
void *p, *end;
- int had_debugfs_info, init_debugfs = 0;
mutex_lock(&monc->mutex);
- had_debugfs_info = have_debugfs_info(monc);
-
dout("handle_monmap\n");
p = msg->front.iov_base;
end = p + msg->front.iov_len;
@@ -399,29 +467,11 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
client->monc.monmap = monmap;
kfree(old);
- if (!client->have_fsid) {
- client->have_fsid = true;
- if (!had_debugfs_info && have_debugfs_info(monc)) {
- pr_info("client%lld fsid %pU\n",
- ceph_client_id(monc->client),
- &monc->client->fsid);
- init_debugfs = 1;
- }
- mutex_unlock(&monc->mutex);
-
- if (init_debugfs) {
- /*
- * do debugfs initialization without mutex to avoid
- * creating a locking dependency
- */
- ceph_debugfs_client_init(monc->client);
- }
+ __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
+ client->have_fsid = true;
- goto out_unlocked;
- }
out:
mutex_unlock(&monc->mutex);
-out_unlocked:
wake_up_all(&client->auth_wq);
}
@@ -749,18 +799,15 @@ static void delayed_work(struct work_struct *work)
dout("monc delayed_work\n");
mutex_lock(&monc->mutex);
if (monc->hunting) {
- __close_session(monc);
- __open_session(monc); /* continue hunting */
+ dout("%s continuing hunt\n", __func__);
+ reopen_session(monc);
} else {
- struct ceph_options *opt = monc->client->options;
int is_auth = ceph_auth_is_authenticated(monc->auth);
if (ceph_con_keepalive_expired(&monc->con,
- opt->monc_ping_timeout)) {
+ CEPH_MONC_PING_TIMEOUT)) {
dout("monc keepalive timeout\n");
is_auth = 0;
- __close_session(monc);
- monc->hunting = true;
- __open_session(monc);
+ reopen_session(monc);
}
if (!monc->hunting) {
@@ -768,8 +815,14 @@ static void delayed_work(struct work_struct *work)
__validate_auth(monc);
}
- if (is_auth)
- __send_subscribe(monc);
+ if (is_auth) {
+ unsigned long now = jiffies;
+
+ dout("%s renew subs? now %lu renew after %lu\n",
+ __func__, now, monc->sub_renew_after);
+ if (time_after_eq(now, monc->sub_renew_after))
+ __send_subscribe(monc);
+ }
}
__schedule_delayed(monc);
mutex_unlock(&monc->mutex);
@@ -856,18 +909,14 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
&monc->client->msgr);
monc->cur_mon = -1;
- monc->hunting = true;
- monc->sub_renew_after = jiffies;
- monc->sub_sent = 0;
+ monc->had_a_connection = false;
+ monc->hunt_mult = 1;
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
monc->generic_request_tree = RB_ROOT;
monc->num_generic_requests = 0;
monc->last_tid = 0;
- monc->have_mdsmap = 0;
- monc->have_osdmap = 0;
- monc->want_next_osdmap = 1;
return 0;
out_auth_reply:
@@ -892,7 +941,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
mutex_lock(&monc->mutex);
__close_session(monc);
-
+ monc->cur_mon = -1;
mutex_unlock(&monc->mutex);
/*
@@ -914,26 +963,40 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
}
EXPORT_SYMBOL(ceph_monc_stop);
+static void finish_hunting(struct ceph_mon_client *monc)
+{
+ if (monc->hunting) {
+ dout("%s found mon%d\n", __func__, monc->cur_mon);
+ monc->hunting = false;
+ monc->had_a_connection = true;
+ monc->hunt_mult /= 2; /* reduce by 50% */
+ if (monc->hunt_mult < 1)
+ monc->hunt_mult = 1;
+ }
+}
+
static void handle_auth_reply(struct ceph_mon_client *monc,
struct ceph_msg *msg)
{
int ret;
int was_auth = 0;
- int had_debugfs_info, init_debugfs = 0;
mutex_lock(&monc->mutex);
- had_debugfs_info = have_debugfs_info(monc);
was_auth = ceph_auth_is_authenticated(monc->auth);
monc->pending_auth = 0;
ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
msg->front.iov_len,
monc->m_auth->front.iov_base,
monc->m_auth->front_alloc_len);
+ if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ goto out;
+ }
+
+ finish_hunting(monc);
+
if (ret < 0) {
monc->client->auth_err = ret;
- wake_up_all(&monc->client->auth_wq);
- } else if (ret > 0) {
- __send_prepared_auth_request(monc, ret);
} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
dout("authenticated, starting session\n");
@@ -943,23 +1006,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
__send_subscribe(monc);
__resend_generic_request(monc);
- }
- if (!had_debugfs_info && have_debugfs_info(monc)) {
- pr_info("client%lld fsid %pU\n",
- ceph_client_id(monc->client),
- &monc->client->fsid);
- init_debugfs = 1;
+ pr_info("mon%d %s session established\n", monc->cur_mon,
+ ceph_pr_addr(&monc->con.peer_addr.in_addr));
}
- mutex_unlock(&monc->mutex);
- if (init_debugfs) {
- /*
- * do debugfs initialization without mutex to avoid
- * creating a locking dependency
- */
- ceph_debugfs_client_init(monc->client);
- }
+out:
+ mutex_unlock(&monc->mutex);
+ if (monc->client->auth_err < 0)
+ wake_up_all(&monc->client->auth_wq);
}
static int __validate_auth(struct ceph_mon_client *monc)
@@ -1100,29 +1155,17 @@ static void mon_fault(struct ceph_connection *con)
{
struct ceph_mon_client *monc = con->private;
- if (!monc)
- return;
-
- dout("mon_fault\n");
mutex_lock(&monc->mutex);
- if (!con->private)
- goto out;
-
- if (!monc->hunting)
- pr_info("mon%d %s session lost, "
- "hunting for new mon\n", monc->cur_mon,
- ceph_pr_addr(&monc->con.peer_addr.in_addr));
-
- __close_session(monc);
- if (!monc->hunting) {
- /* start hunting */
- monc->hunting = true;
- __open_session(monc);
- } else {
- /* already hunting, let's wait a bit */
- __schedule_delayed(monc);
+ dout("%s mon%d\n", __func__, monc->cur_mon);
+ if (monc->cur_mon >= 0) {
+ if (!monc->hunting) {
+ dout("%s hunting for new mon\n", __func__);
+ reopen_session(monc);
+ __schedule_delayed(monc);
+ } else {
+ dout("%s already hunting\n", __func__);
+ }
}
-out:
mutex_unlock(&monc->mutex);
}
diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
index f79ccac6699f..32355d9d0103 100644
--- a/net/ceph/osd_client.c
+++ b/net/ceph/osd_client.c
@@ -120,11 +120,13 @@ static void ceph_osd_data_bio_init(struct ceph_osd_data *osd_data,
}
#endif /* CONFIG_BLOCK */
-#define osd_req_op_data(oreq, whch, typ, fld) \
- ({ \
- BUG_ON(whch >= (oreq)->r_num_ops); \
- &(oreq)->r_ops[whch].typ.fld; \
- })
+#define osd_req_op_data(oreq, whch, typ, fld) \
+({ \
+ struct ceph_osd_request *__oreq = (oreq); \
+ unsigned int __whch = (whch); \
+ BUG_ON(__whch >= __oreq->r_num_ops); \
+ &__oreq->r_ops[__whch].typ.fld; \
+})
static struct ceph_osd_data *
osd_req_op_raw_data_in(struct ceph_osd_request *osd_req, unsigned int which)
@@ -336,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref)
ceph_put_snap_context(req->r_snapc);
if (req->r_mempool)
mempool_free(req, req->r_osdc->req_mempool);
- else
+ else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
kmem_cache_free(ceph_osd_request_cache, req);
-
+ else
+ kfree(req);
}
void ceph_osdc_get_request(struct ceph_osd_request *req)
@@ -367,28 +370,22 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_msg *msg;
size_t msg_size;
- BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
- BUG_ON(num_ops > CEPH_OSD_MAX_OP);
-
- msg_size = 4 + 4 + 8 + 8 + 4+8;
- msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
- msg_size += 1 + 8 + 4 + 4; /* pg_t */
- msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
- msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
- msg_size += 8; /* snapid */
- msg_size += 8; /* snap_seq */
- msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
- msg_size += 4;
-
if (use_mempool) {
+ BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
req = mempool_alloc(osdc->req_mempool, gfp_flags);
- memset(req, 0, sizeof(*req));
+ } else if (num_ops <= CEPH_OSD_SLAB_OPS) {
+ req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
} else {
- req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags);
+ BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
+ req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
+ gfp_flags);
}
- if (req == NULL)
+ if (unlikely(!req))
return NULL;
+ /* req only, each op is zeroed in _osd_req_op_init() */
+ memset(req, 0, sizeof(*req));
+
req->r_osdc = osdc;
req->r_mempool = use_mempool;
req->r_num_ops = num_ops;
@@ -406,18 +403,36 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
req->r_base_oloc.pool = -1;
req->r_target_oloc.pool = -1;
+ msg_size = OSD_OPREPLY_FRONT_LEN;
+ if (num_ops > CEPH_OSD_SLAB_OPS) {
+ /* ceph_osd_op and rval */
+ msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
+ (sizeof(struct ceph_osd_op) + 4);
+ }
+
/* create reply message */
if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
else
- msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
- OSD_OPREPLY_FRONT_LEN, gfp_flags, true);
+ msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
+ gfp_flags, true);
if (!msg) {
ceph_osdc_put_request(req);
return NULL;
}
req->r_reply = msg;
+ msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
+ msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
+ msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
+ msg_size += 1 + 8 + 4 + 4; /* pgid */
+ msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
+ msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
+ msg_size += 8; /* snapid */
+ msg_size += 8; /* snap_seq */
+ msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
+ msg_size += 4; /* retry_attempt */
+
/* create request message; allow space for oid */
if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
@@ -496,7 +511,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
payload_len += length;
- op->payload_len = payload_len;
+ op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_extent_init);
@@ -515,10 +530,32 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
BUG_ON(length > previous);
op->extent.length = length;
- op->payload_len -= previous - length;
+ op->indata_len -= previous - length;
}
EXPORT_SYMBOL(osd_req_op_extent_update);
+void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
+ unsigned int which, u64 offset_inc)
+{
+ struct ceph_osd_req_op *op, *prev_op;
+
+ BUG_ON(which + 1 >= osd_req->r_num_ops);
+
+ prev_op = &osd_req->r_ops[which];
+ op = _osd_req_op_init(osd_req, which + 1, prev_op->op, prev_op->flags);
+ /* dup previous one */
+ op->indata_len = prev_op->indata_len;
+ op->outdata_len = prev_op->outdata_len;
+ op->extent = prev_op->extent;
+ /* adjust offset */
+ op->extent.offset += offset_inc;
+ op->extent.length -= offset_inc;
+
+ if (op->op == CEPH_OSD_OP_WRITE || op->op == CEPH_OSD_OP_WRITEFULL)
+ op->indata_len -= offset_inc;
+}
+EXPORT_SYMBOL(osd_req_op_extent_dup_last);
+
void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *class, const char *method)
{
@@ -552,7 +589,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
op->cls.argc = 0; /* currently unused */
- op->payload_len = payload_len;
+ op->indata_len = payload_len;
}
EXPORT_SYMBOL(osd_req_op_cls_init);
@@ -585,7 +622,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
op->xattr.cmp_mode = cmp_mode;
ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
- op->payload_len = payload_len;
+ op->indata_len = payload_len;
return 0;
}
EXPORT_SYMBOL(osd_req_op_xattr_init);
@@ -705,7 +742,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
dst->cls.indata_len = cpu_to_le32(data_length);
ceph_osdc_msg_data_add(req->r_request, osd_data);
- src->payload_len += data_length;
+ src->indata_len += data_length;
request_data_len += data_length;
}
osd_data = &src->cls.response_data;
@@ -748,7 +785,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->op = cpu_to_le16(src->op);
dst->flags = cpu_to_le32(src->flags);
- dst->payload_len = cpu_to_le32(src->payload_len);
+ dst->payload_len = cpu_to_le32(src->indata_len);
return request_data_len;
}
@@ -1750,8 +1787,7 @@ static void complete_request(struct ceph_osd_request *req)
* handle osd op reply. either call the callback if it is specified,
* or do the completion to wake up the waiting thread.
*/
-static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
- struct ceph_connection *con)
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
{
void *p, *end;
struct ceph_osd_request *req;
@@ -1769,6 +1805,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
u32 osdmap_epoch;
int already_completed;
u32 bytes;
+ u8 decode_redir;
unsigned int i;
tid = le64_to_cpu(msg->hdr.tid);
@@ -1808,7 +1845,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
ceph_decode_need(&p, end, 4, bad_put);
numops = ceph_decode_32(&p);
- if (numops > CEPH_OSD_MAX_OP)
+ if (numops > CEPH_OSD_MAX_OPS)
goto bad_put;
if (numops != req->r_num_ops)
goto bad_put;
@@ -1819,7 +1856,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
int len;
len = le32_to_cpu(op->payload_len);
- req->r_reply_op_len[i] = len;
+ req->r_ops[i].outdata_len = len;
dout(" op %d has %d bytes\n", i, len);
payload_len += len;
p += sizeof(*op);
@@ -1834,12 +1871,21 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
retry_attempt = ceph_decode_32(&p);
for (i = 0; i < numops; i++)
- req->r_reply_op_result[i] = ceph_decode_32(&p);
+ req->r_ops[i].rval = ceph_decode_32(&p);
if (le16_to_cpu(msg->hdr.version) >= 6) {
p += 8 + 4; /* skip replay_version */
p += 8; /* skip user_version */
+ if (le16_to_cpu(msg->hdr.version) >= 7)
+ ceph_decode_8_safe(&p, end, decode_redir, bad_put);
+ else
+ decode_redir = 1;
+ } else {
+ decode_redir = 0;
+ }
+
+ if (decode_redir) {
err = ceph_redirect_decode(&p, end, &redir);
if (err)
goto bad_put;
@@ -2176,7 +2222,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
goto bad;
done:
downgrade_write(&osdc->map_sem);
- ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+ ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
+ osdc->osdmap->epoch);
/*
* subscribe to subsequent osdmap updates if full to ensure
@@ -2635,8 +2682,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
round_jiffies_relative(osdc->client->options->osd_idle_ttl));
err = -ENOMEM;
- osdc->req_mempool = mempool_create_kmalloc_pool(10,
- sizeof(struct ceph_osd_request));
+ osdc->req_mempool = mempool_create_slab_pool(10,
+ ceph_osd_request_cache);
if (!osdc->req_mempool)
goto out;
@@ -2771,11 +2818,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages);
int ceph_osdc_setup(void)
{
+ size_t size = sizeof(struct ceph_osd_request) +
+ CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
+
BUG_ON(ceph_osd_request_cache);
- ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
- sizeof (struct ceph_osd_request),
- __alignof__(struct ceph_osd_request),
- 0, NULL);
+ ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
+ 0, 0, NULL);
return ceph_osd_request_cache ? 0 : -ENOMEM;
}
@@ -2807,7 +2855,7 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
ceph_osdc_handle_map(osdc, msg);
break;
case CEPH_MSG_OSD_OPREPLY:
- handle_reply(osdc, msg, con);
+ handle_reply(osdc, msg);
break;
case CEPH_MSG_WATCH_NOTIFY:
handle_watch_notify(osdc, msg);
@@ -2842,16 +2890,13 @@ static struct ceph_msg *get_reply(struct ceph_connection *con,
mutex_lock(&osdc->request_mutex);
req = __lookup_request(osdc, tid);
if (!req) {
- pr_warn("%s osd%d tid %llu unknown, skipping\n",
- __func__, osd->o_osd, tid);
+ dout("%s osd%d tid %llu unknown, skipping\n", __func__,
+ osd->o_osd, tid);
m = NULL;
*skip = 1;
goto out;
}
- if (req->r_reply->con)
- dout("%s revoking msg %p from old con %p\n", __func__,
- req->r_reply, req->r_reply->con);
ceph_msg_revoke_incoming(req->r_reply);
if (front_len > req->r_reply->front_alloc_len) {
@@ -2978,17 +3023,19 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc);
}
-static int sign_message(struct ceph_connection *con, struct ceph_msg *msg)
+static int osd_sign_message(struct ceph_msg *msg)
{
- struct ceph_osd *o = con->private;
+ struct ceph_osd *o = msg->con->private;
struct ceph_auth_handshake *auth = &o->o_auth;
+
return ceph_auth_sign_message(auth, msg);
}
-static int check_message_signature(struct ceph_connection *con, struct ceph_msg *msg)
+static int osd_check_message_signature(struct ceph_msg *msg)
{
- struct ceph_osd *o = con->private;
+ struct ceph_osd *o = msg->con->private;
struct ceph_auth_handshake *auth = &o->o_auth;
+
return ceph_auth_check_message_signature(auth, msg);
}
@@ -3000,7 +3047,7 @@ static const struct ceph_connection_operations osd_con_ops = {
.verify_authorizer_reply = verify_authorizer_reply,
.invalidate_authorizer = invalidate_authorizer,
.alloc_msg = alloc_msg,
- .sign_message = sign_message,
- .check_message_signature = check_message_signature,
+ .sign_message = osd_sign_message,
+ .check_message_signature = osd_check_message_signature,
.fault = osd_reset,
};
diff --git a/net/ceph/osdmap.c b/net/ceph/osdmap.c
index 7d8f581d9f1f..243574c8cf33 100644
--- a/net/ceph/osdmap.c
+++ b/net/ceph/osdmap.c
@@ -342,23 +342,32 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
c->choose_local_tries = ceph_decode_32(p);
c->choose_local_fallback_tries = ceph_decode_32(p);
c->choose_total_tries = ceph_decode_32(p);
- dout("crush decode tunable choose_local_tries = %d",
+ dout("crush decode tunable choose_local_tries = %d\n",
c->choose_local_tries);
- dout("crush decode tunable choose_local_fallback_tries = %d",
+ dout("crush decode tunable choose_local_fallback_tries = %d\n",
c->choose_local_fallback_tries);
- dout("crush decode tunable choose_total_tries = %d",
+ dout("crush decode tunable choose_total_tries = %d\n",
c->choose_total_tries);
ceph_decode_need(p, end, sizeof(u32), done);
c->chooseleaf_descend_once = ceph_decode_32(p);
- dout("crush decode tunable chooseleaf_descend_once = %d",
+ dout("crush decode tunable chooseleaf_descend_once = %d\n",
c->chooseleaf_descend_once);
ceph_decode_need(p, end, sizeof(u8), done);
c->chooseleaf_vary_r = ceph_decode_8(p);
- dout("crush decode tunable chooseleaf_vary_r = %d",
+ dout("crush decode tunable chooseleaf_vary_r = %d\n",
c->chooseleaf_vary_r);
+ /* skip straw_calc_version, allowed_bucket_algs */
+ ceph_decode_need(p, end, sizeof(u8) + sizeof(u32), done);
+ *p += sizeof(u8) + sizeof(u32);
+
+ ceph_decode_need(p, end, sizeof(u8), done);
+ c->chooseleaf_stable = ceph_decode_8(p);
+ dout("crush decode tunable chooseleaf_stable = %d\n",
+ c->chooseleaf_stable);
+
done:
dout("crush_decode success\n");
return c;
diff --git a/net/ceph/pagelist.c b/net/ceph/pagelist.c
index c7c220a736e5..6864007e64fc 100644
--- a/net/ceph/pagelist.c
+++ b/net/ceph/pagelist.c
@@ -56,7 +56,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
size_t bit = pl->room;
int ret;
- memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_MASK),
buf, bit);
pl->length += bit;
pl->room -= bit;
@@ -67,7 +67,7 @@ int ceph_pagelist_append(struct ceph_pagelist *pl, const void *buf, size_t len)
return ret;
}
- memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_MASK), buf, len);
pl->length += len;
pl->room -= len;
return 0;
diff --git a/net/ceph/pagevec.c b/net/ceph/pagevec.c
index d4f5f220a8e5..00d2601407c5 100644
--- a/net/ceph/pagevec.c
+++ b/net/ceph/pagevec.c
@@ -24,7 +24,7 @@ struct page **ceph_get_direct_page_vector(const void __user *data,
return ERR_PTR(-ENOMEM);
while (got < num_pages) {
- rc = get_user_pages_unlocked(current, current->mm,
+ rc = get_user_pages_unlocked(
(unsigned long)data + ((unsigned long)got * PAGE_SIZE),
num_pages - got, write_page, 0, pages + got);
if (rc < 0)
@@ -95,19 +95,19 @@ int ceph_copy_user_to_page_vector(struct page **pages,
loff_t off, size_t len)
{
int i = 0;
- int po = off & ~PAGE_CACHE_MASK;
+ int po = off & ~PAGE_MASK;
int left = len;
int l, bad;
while (left > 0) {
- l = min_t(int, PAGE_CACHE_SIZE-po, left);
+ l = min_t(int, PAGE_SIZE-po, left);
bad = copy_from_user(page_address(pages[i]) + po, data, l);
if (bad == l)
return -EFAULT;
data += l - bad;
left -= l - bad;
po += l - bad;
- if (po == PAGE_CACHE_SIZE) {
+ if (po == PAGE_SIZE) {
po = 0;
i++;
}
@@ -121,17 +121,17 @@ void ceph_copy_to_page_vector(struct page **pages,
loff_t off, size_t len)
{
int i = 0;
- size_t po = off & ~PAGE_CACHE_MASK;
+ size_t po = off & ~PAGE_MASK;
size_t left = len;
while (left > 0) {
- size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+ size_t l = min_t(size_t, PAGE_SIZE-po, left);
memcpy(page_address(pages[i]) + po, data, l);
data += l;
left -= l;
po += l;
- if (po == PAGE_CACHE_SIZE) {
+ if (po == PAGE_SIZE) {
po = 0;
i++;
}
@@ -144,17 +144,17 @@ void ceph_copy_from_page_vector(struct page **pages,
loff_t off, size_t len)
{
int i = 0;
- size_t po = off & ~PAGE_CACHE_MASK;
+ size_t po = off & ~PAGE_MASK;
size_t left = len;
while (left > 0) {
- size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
+ size_t l = min_t(size_t, PAGE_SIZE-po, left);
memcpy(data, page_address(pages[i]) + po, l);
data += l;
left -= l;
po += l;
- if (po == PAGE_CACHE_SIZE) {
+ if (po == PAGE_SIZE) {
po = 0;
i++;
}
@@ -168,25 +168,25 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector);
*/
void ceph_zero_page_vector_range(int off, int len, struct page **pages)
{
- int i = off >> PAGE_CACHE_SHIFT;
+ int i = off >> PAGE_SHIFT;
- off &= ~PAGE_CACHE_MASK;
+ off &= ~PAGE_MASK;
dout("zero_page_vector_page %u~%u\n", off, len);
/* leading partial page? */
if (off) {
- int end = min((int)PAGE_CACHE_SIZE, off + len);
+ int end = min((int)PAGE_SIZE, off + len);
dout("zeroing %d %p head from %d\n", i, pages[i],
(int)off);
zero_user_segment(pages[i], off, end);
len -= (end - off);
i++;
}
- while (len >= PAGE_CACHE_SIZE) {
+ while (len >= PAGE_SIZE) {
dout("zeroing %d %p len=%d\n", i, pages[i], len);
- zero_user_segment(pages[i], 0, PAGE_CACHE_SIZE);
- len -= PAGE_CACHE_SIZE;
+ zero_user_segment(pages[i], 0, PAGE_SIZE);
+ len -= PAGE_SIZE;
i++;
}
/* trailing partial page? */
diff --git a/net/core/Makefile b/net/core/Makefile
index 086b01fbe1bd..d6508c2ddca5 100644
--- a/net/core/Makefile
+++ b/net/core/Makefile
@@ -9,7 +9,7 @@ obj-$(CONFIG_SYSCTL) += sysctl_net_core.o
obj-y += dev.o ethtool.o dev_addr_lists.o dst.o netevent.o \
neighbour.o rtnetlink.o utils.o link_watch.o filter.o \
- sock_diag.o dev_ioctl.o tso.o
+ sock_diag.o dev_ioctl.o tso.o sock_reuseport.o
obj-$(CONFIG_XFRM) += flow.o
obj-y += net-sysfs.o
@@ -24,3 +24,6 @@ obj-$(CONFIG_NET_PTP_CLASSIFY) += ptp_classifier.o
obj-$(CONFIG_CGROUP_NET_PRIO) += netprio_cgroup.o
obj-$(CONFIG_CGROUP_NET_CLASSID) += netclassid_cgroup.o
obj-$(CONFIG_LWTUNNEL) += lwtunnel.o
+obj-$(CONFIG_DST_CACHE) += dst_cache.o
+obj-$(CONFIG_HWBM) += hwbm.o
+obj-$(CONFIG_NET_DEVLINK) += devlink.o
diff --git a/net/core/datagram.c b/net/core/datagram.c
index 617088aee21d..fa9dc6450b08 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -83,8 +83,8 @@ static int receiver_wake_function(wait_queue_t *wait, unsigned int mode, int syn
/*
* Wait for the last received packet to be different from skb
*/
-static int wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
- const struct sk_buff *skb)
+int __skb_wait_for_more_packets(struct sock *sk, int *err, long *timeo_p,
+ const struct sk_buff *skb)
{
int error;
DEFINE_WAIT_FUNC(wait, receiver_wake_function);
@@ -130,6 +130,7 @@ out_noerr:
error = 1;
goto out;
}
+EXPORT_SYMBOL(__skb_wait_for_more_packets);
static struct sk_buff *skb_set_peeked(struct sk_buff *skb)
{
@@ -161,13 +162,15 @@ done:
}
/**
- * __skb_recv_datagram - Receive a datagram skbuff
+ * __skb_try_recv_datagram - Receive a datagram skbuff
* @sk: socket
* @flags: MSG_ flags
* @peeked: returns non-zero if this packet has been seen before
* @off: an offset in bytes to peek skb from. Returns an offset
* within an skb where data actually starts
* @err: error code returned
+ * @last: set to last peeked message to inform the wait function
+ * what to look for when peeking
*
* Get a datagram skbuff, understands the peeking, nonblocking wakeups
* and possible races. This replaces identical code in packet, raw and
@@ -175,9 +178,11 @@ done:
* the long standing peek and read race for datagram sockets. If you
* alter this routine remember it must be re-entrant.
*
- * This function will lock the socket if a skb is returned, so the caller
- * needs to unlock the socket in that case (usually by calling
- * skb_free_datagram)
+ * This function will lock the socket if a skb is returned, so
+ * the caller needs to unlock the socket in that case (usually by
+ * calling skb_free_datagram). Returns NULL with *err set to
+ * -EAGAIN if no data was available or to some other value if an
+ * error was detected.
*
* * It does not lock socket since today. This function is
* * free of race conditions. This measure should/can improve
@@ -191,13 +196,13 @@ done:
* quite explicitly by POSIX 1003.1g, don't change them without having
* the standard around please.
*/
-struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
- int *peeked, int *off, int *err)
+struct sk_buff *__skb_try_recv_datagram(struct sock *sk, unsigned int flags,
+ int *peeked, int *off, int *err,
+ struct sk_buff **last)
{
struct sk_buff_head *queue = &sk->sk_receive_queue;
- struct sk_buff *skb, *last;
+ struct sk_buff *skb;
unsigned long cpu_flags;
- long timeo;
/*
* Caller is allowed not to check sk->sk_err before skb_recv_datagram()
*/
@@ -206,8 +211,6 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
if (error)
goto no_packet;
- timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
-
do {
/* Again only user level code calls this function, so nothing
* interrupt level will suddenly eat the receive_queue.
@@ -217,10 +220,10 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
*/
int _off = *off;
- last = (struct sk_buff *)queue;
+ *last = (struct sk_buff *)queue;
spin_lock_irqsave(&queue->lock, cpu_flags);
skb_queue_walk(queue, skb) {
- last = skb;
+ *last = skb;
*peeked = skb->peeked;
if (flags & MSG_PEEK) {
if (_off >= skb->len && (skb->len || _off ||
@@ -231,8 +234,11 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
skb = skb_set_peeked(skb);
error = PTR_ERR(skb);
- if (IS_ERR(skb))
- goto unlock_err;
+ if (IS_ERR(skb)) {
+ spin_unlock_irqrestore(&queue->lock,
+ cpu_flags);
+ goto no_packet;
+ }
atomic_inc(&skb->users);
} else
@@ -242,25 +248,38 @@ struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
*off = _off;
return skb;
}
+
spin_unlock_irqrestore(&queue->lock, cpu_flags);
+ } while (sk_can_busy_loop(sk) &&
+ sk_busy_loop(sk, flags & MSG_DONTWAIT));
- if (sk_can_busy_loop(sk) &&
- sk_busy_loop(sk, flags & MSG_DONTWAIT))
- continue;
+ error = -EAGAIN;
- /* User doesn't want to wait */
- error = -EAGAIN;
- if (!timeo)
- goto no_packet;
+no_packet:
+ *err = error;
+ return NULL;
+}
+EXPORT_SYMBOL(__skb_try_recv_datagram);
- } while (!wait_for_more_packets(sk, err, &timeo, last));
+struct sk_buff *__skb_recv_datagram(struct sock *sk, unsigned int flags,
+ int *peeked, int *off, int *err)
+{
+ struct sk_buff *skb, *last;
+ long timeo;
- return NULL;
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ do {
+ skb = __skb_try_recv_datagram(sk, flags, peeked, off, err,
+ &last);
+ if (skb)
+ return skb;
+
+ if (*err != -EAGAIN)
+ break;
+ } while (timeo &&
+ !__skb_wait_for_more_packets(sk, err, &timeo, last));
-unlock_err:
- spin_unlock_irqrestore(&queue->lock, cpu_flags);
-no_packet:
- *err = error;
return NULL;
}
EXPORT_SYMBOL(__skb_recv_datagram);
@@ -785,7 +804,7 @@ unsigned int datagram_poll(struct file *file, struct socket *sock,
if (sock_writeable(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
diff --git a/net/core/dev.c b/net/core/dev.c
index 8ce3f74cd6b9..77a71cd68535 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -96,6 +96,7 @@
#include <linux/skbuff.h>
#include <net/net_namespace.h>
#include <net/sock.h>
+#include <net/busy_poll.h>
#include <linux/rtnetlink.h>
#include <linux/stat.h>
#include <net/dst.h>
@@ -137,6 +138,7 @@
#include <linux/errqueue.h>
#include <linux/hrtimer.h>
#include <linux/netfilter_ingress.h>
+#include <linux/sctp.h>
#include "net-sysfs.h"
@@ -182,8 +184,8 @@ EXPORT_SYMBOL(dev_base_lock);
/* protects napi_hash addition/deletion and napi_gen_id */
static DEFINE_SPINLOCK(napi_hash_lock);
-static unsigned int napi_gen_id;
-static DEFINE_HASHTABLE(napi_hash, 8);
+static unsigned int napi_gen_id = NR_CPUS;
+static DEFINE_READ_MOSTLY_HASHTABLE(napi_hash, 8);
static seqcount_t devnet_rename_seq;
@@ -1674,6 +1676,22 @@ void net_dec_ingress_queue(void)
EXPORT_SYMBOL_GPL(net_dec_ingress_queue);
#endif
+#ifdef CONFIG_NET_EGRESS
+static struct static_key egress_needed __read_mostly;
+
+void net_inc_egress_queue(void)
+{
+ static_key_slow_inc(&egress_needed);
+}
+EXPORT_SYMBOL_GPL(net_inc_egress_queue);
+
+void net_dec_egress_queue(void)
+{
+ static_key_slow_dec(&egress_needed);
+}
+EXPORT_SYMBOL_GPL(net_dec_egress_queue);
+#endif
+
static struct static_key netstamp_needed __read_mostly;
#ifdef HAVE_JUMP_LABEL
/* We are not allowed to call static_key_slow_dec() from irq context
@@ -2403,17 +2421,20 @@ static void skb_warn_bad_offload(const struct sk_buff *skb)
{
static const netdev_features_t null_features = 0;
struct net_device *dev = skb->dev;
- const char *driver = "";
+ const char *name = "";
if (!net_ratelimit())
return;
- if (dev && dev->dev.parent)
- driver = dev_driver_string(dev->dev.parent);
-
+ if (dev) {
+ if (dev->dev.parent)
+ name = dev_driver_string(dev->dev.parent);
+ else
+ name = netdev_name(dev);
+ }
WARN(1, "%s: caps=(%pNF, %pNF) len=%d data_len=%d gso_size=%d "
"gso_type=%d ip_summed=%d\n",
- driver, dev ? &dev->features : &null_features,
+ name, dev ? &dev->features : &null_features,
skb->sk ? &skb->sk->sk_route_caps : &null_features,
skb->len, skb->data_len, skb_shinfo(skb)->gso_size,
skb_shinfo(skb)->gso_type, skb->ip_summed);
@@ -2467,6 +2488,141 @@ out:
}
EXPORT_SYMBOL(skb_checksum_help);
+/* skb_csum_offload_check - Driver helper function to determine if a device
+ * with limited checksum offload capabilities is able to offload the checksum
+ * for a given packet.
+ *
+ * Arguments:
+ * skb - sk_buff for the packet in question
+ * spec - contains the description of what device can offload
+ * csum_encapped - returns true if the checksum being offloaded is
+ * encpasulated. That is it is checksum for the transport header
+ * in the inner headers.
+ * checksum_help - when set indicates that helper function should
+ * call skb_checksum_help if offload checks fail
+ *
+ * Returns:
+ * true: Packet has passed the checksum checks and should be offloadable to
+ * the device (a driver may still need to check for additional
+ * restrictions of its device)
+ * false: Checksum is not offloadable. If checksum_help was set then
+ * skb_checksum_help was called to resolve checksum for non-GSO
+ * packets and when IP protocol is not SCTP
+ */
+bool __skb_csum_offload_chk(struct sk_buff *skb,
+ const struct skb_csum_offl_spec *spec,
+ bool *csum_encapped,
+ bool csum_help)
+{
+ struct iphdr *iph;
+ struct ipv6hdr *ipv6;
+ void *nhdr;
+ int protocol;
+ u8 ip_proto;
+
+ if (skb->protocol == htons(ETH_P_8021Q) ||
+ skb->protocol == htons(ETH_P_8021AD)) {
+ if (!spec->vlan_okay)
+ goto need_help;
+ }
+
+ /* We check whether the checksum refers to a transport layer checksum in
+ * the outermost header or an encapsulated transport layer checksum that
+ * corresponds to the inner headers of the skb. If the checksum is for
+ * something else in the packet we need help.
+ */
+ if (skb_checksum_start_offset(skb) == skb_transport_offset(skb)) {
+ /* Non-encapsulated checksum */
+ protocol = eproto_to_ipproto(vlan_get_protocol(skb));
+ nhdr = skb_network_header(skb);
+ *csum_encapped = false;
+ if (spec->no_not_encapped)
+ goto need_help;
+ } else if (skb->encapsulation && spec->encap_okay &&
+ skb_checksum_start_offset(skb) ==
+ skb_inner_transport_offset(skb)) {
+ /* Encapsulated checksum */
+ *csum_encapped = true;
+ switch (skb->inner_protocol_type) {
+ case ENCAP_TYPE_ETHER:
+ protocol = eproto_to_ipproto(skb->inner_protocol);
+ break;
+ case ENCAP_TYPE_IPPROTO:
+ protocol = skb->inner_protocol;
+ break;
+ }
+ nhdr = skb_inner_network_header(skb);
+ } else {
+ goto need_help;
+ }
+
+ switch (protocol) {
+ case IPPROTO_IP:
+ if (!spec->ipv4_okay)
+ goto need_help;
+ iph = nhdr;
+ ip_proto = iph->protocol;
+ if (iph->ihl != 5 && !spec->ip_options_okay)
+ goto need_help;
+ break;
+ case IPPROTO_IPV6:
+ if (!spec->ipv6_okay)
+ goto need_help;
+ if (spec->no_encapped_ipv6 && *csum_encapped)
+ goto need_help;
+ ipv6 = nhdr;
+ nhdr += sizeof(*ipv6);
+ ip_proto = ipv6->nexthdr;
+ break;
+ default:
+ goto need_help;
+ }
+
+ip_proto_again:
+ switch (ip_proto) {
+ case IPPROTO_TCP:
+ if (!spec->tcp_okay ||
+ skb->csum_offset != offsetof(struct tcphdr, check))
+ goto need_help;
+ break;
+ case IPPROTO_UDP:
+ if (!spec->udp_okay ||
+ skb->csum_offset != offsetof(struct udphdr, check))
+ goto need_help;
+ break;
+ case IPPROTO_SCTP:
+ if (!spec->sctp_okay ||
+ skb->csum_offset != offsetof(struct sctphdr, checksum))
+ goto cant_help;
+ break;
+ case NEXTHDR_HOP:
+ case NEXTHDR_ROUTING:
+ case NEXTHDR_DEST: {
+ u8 *opthdr = nhdr;
+
+ if (protocol != IPPROTO_IPV6 || !spec->ext_hdrs_okay)
+ goto need_help;
+
+ ip_proto = opthdr[0];
+ nhdr += (opthdr[1] + 1) << 3;
+
+ goto ip_proto_again;
+ }
+ default:
+ goto need_help;
+ }
+
+ /* Passed the tests for offloading checksum */
+ return true;
+
+need_help:
+ if (csum_help && !skb_shinfo(skb)->gso_size)
+ skb_checksum_help(skb);
+cant_help:
+ return false;
+}
+EXPORT_SYMBOL(__skb_csum_offload_chk);
+
__be16 skb_network_protocol(struct sk_buff *skb, int *depth)
{
__be16 type = skb->protocol;
@@ -2539,6 +2695,8 @@ static inline bool skb_needs_check(struct sk_buff *skb, bool tx_path)
*
* It may return NULL if the skb requires no segmentation. This is
* only possible when GSO is used for verifying header integrity.
+ *
+ * Segmentation preserves SKB_SGO_CB_OFFSET bytes of previous skb cb.
*/
struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
netdev_features_t features, bool tx_path)
@@ -2553,6 +2711,9 @@ struct sk_buff *__skb_gso_segment(struct sk_buff *skb,
return ERR_PTR(err);
}
+ BUILD_BUG_ON(SKB_SGO_CB_OFFSET +
+ sizeof(*SKB_GSO_CB(skb)) > sizeof(skb->cb));
+
SKB_GSO_CB(skb)->mac_offset = skb_headroom(skb);
SKB_GSO_CB(skb)->encap_level = 0;
@@ -2641,7 +2802,7 @@ static netdev_features_t harmonize_features(struct sk_buff *skb,
if (skb->ip_summed != CHECKSUM_NONE &&
!can_checksum_protocol(features, type)) {
- features &= ~NETIF_F_ALL_CSUM;
+ features &= ~NETIF_F_CSUM_MASK;
} else if (illegal_highdma(skb->dev, skb)) {
features &= ~NETIF_F_SG;
}
@@ -2788,7 +2949,7 @@ static struct sk_buff *validate_xmit_skb(struct sk_buff *skb, struct net_device
else
skb_set_transport_header(skb,
skb_checksum_start_offset(skb));
- if (!(features & NETIF_F_ALL_CSUM) &&
+ if (!(features & NETIF_F_CSUM_MASK) &&
skb_checksum_help(skb))
goto out_kfree_skb;
}
@@ -2867,7 +3028,6 @@ static inline int __dev_xmit_skb(struct sk_buff *skb, struct Qdisc *q,
bool contended;
int rc;
- qdisc_pkt_len_init(skb);
qdisc_calculate_pkt_len(skb, q);
/*
* Heuristic to force contended enqueues to serialize on a
@@ -2925,7 +3085,8 @@ static void skb_update_prio(struct sk_buff *skb)
struct netprio_map *map = rcu_dereference_bh(skb->dev->priomap);
if (!skb->priority && skb->sk && map) {
- unsigned int prioidx = skb->sk->sk_cgrp_prioidx;
+ unsigned int prioidx =
+ sock_cgroup_prioidx(&skb->sk->sk_cgrp_data);
if (prioidx < map->priomap_len)
skb->priority = map->priomap[prioidx];
@@ -2959,6 +3120,49 @@ int dev_loopback_xmit(struct net *net, struct sock *sk, struct sk_buff *skb)
}
EXPORT_SYMBOL(dev_loopback_xmit);
+#ifdef CONFIG_NET_EGRESS
+static struct sk_buff *
+sch_handle_egress(struct sk_buff *skb, int *ret, struct net_device *dev)
+{
+ struct tcf_proto *cl = rcu_dereference_bh(dev->egress_cl_list);
+ struct tcf_result cl_res;
+
+ if (!cl)
+ return skb;
+
+ /* skb->tc_verd and qdisc_skb_cb(skb)->pkt_len were already set
+ * earlier by the caller.
+ */
+ qdisc_bstats_cpu_update(cl->q, skb);
+
+ switch (tc_classify(skb, cl, &cl_res, false)) {
+ case TC_ACT_OK:
+ case TC_ACT_RECLASSIFY:
+ skb->tc_index = TC_H_MIN(cl_res.classid);
+ break;
+ case TC_ACT_SHOT:
+ qdisc_qstats_cpu_drop(cl->q);
+ *ret = NET_XMIT_DROP;
+ goto drop;
+ case TC_ACT_STOLEN:
+ case TC_ACT_QUEUED:
+ *ret = NET_XMIT_SUCCESS;
+drop:
+ kfree_skb(skb);
+ return NULL;
+ case TC_ACT_REDIRECT:
+ /* No need to push/pop skb's mac_header here on egress! */
+ skb_do_redirect(skb);
+ *ret = NET_XMIT_SUCCESS;
+ return NULL;
+ default:
+ break;
+ }
+
+ return skb;
+}
+#endif /* CONFIG_NET_EGRESS */
+
static inline int get_xps_queue(struct net_device *dev, struct sk_buff *skb)
{
#ifdef CONFIG_XPS
@@ -3018,7 +3222,9 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
int queue_index = 0;
#ifdef CONFIG_XPS
- if (skb->sender_cpu == 0)
+ u32 sender_cpu = skb->sender_cpu - 1;
+
+ if (sender_cpu >= (u32)NR_CPUS)
skb->sender_cpu = raw_smp_processor_id() + 1;
#endif
@@ -3083,6 +3289,17 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
skb_update_prio(skb);
+ qdisc_pkt_len_init(skb);
+#ifdef CONFIG_NET_CLS_ACT
+ skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
+# ifdef CONFIG_NET_EGRESS
+ if (static_key_false(&egress_needed)) {
+ skb = sch_handle_egress(skb, &rc, dev);
+ if (!skb)
+ goto out;
+ }
+# endif
+#endif
/* If device/qdisc don't need skb->dst, release it right now while
* its hot in this cpu cache.
*/
@@ -3104,9 +3321,6 @@ static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
txq = netdev_pick_tx(dev, skb, accel_priv);
q = rcu_dereference_bh(txq->qdisc);
-#ifdef CONFIG_NET_CLS_ACT
- skb->tc_verd = SET_TC_AT(skb->tc_verd, AT_EGRESS);
-#endif
trace_net_dev_queue(skb);
if (q->enqueue) {
rc = __dev_xmit_skb(skb, q, dev, txq);
@@ -3615,8 +3829,14 @@ static void net_tx_action(struct softirq_action *h)
trace_consume_skb(skb);
else
trace_kfree_skb(skb, net_tx_action);
- __kfree_skb(skb);
+
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE)
+ __kfree_skb(skb);
+ else
+ __kfree_skb_defer(skb);
}
+
+ __kfree_skb_flush();
}
if (sd->output_queue) {
@@ -3663,9 +3883,9 @@ int (*br_fdb_test_addr_hook)(struct net_device *dev,
EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
#endif
-static inline struct sk_buff *handle_ing(struct sk_buff *skb,
- struct packet_type **pt_prev,
- int *ret, struct net_device *orig_dev)
+static inline struct sk_buff *
+sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
+ struct net_device *orig_dev)
{
#ifdef CONFIG_NET_CLS_ACT
struct tcf_proto *cl = rcu_dereference_bh(skb->dev->ingress_cl_list);
@@ -3859,7 +4079,7 @@ another_round:
skip_taps:
#ifdef CONFIG_NET_INGRESS
if (static_key_false(&ingress_needed)) {
- skb = handle_ing(skb, &pt_prev, &ret, orig_dev);
+ skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
if (!skb)
goto out;
@@ -3940,7 +4160,10 @@ ncls:
ret = pt_prev->func(skb, skb->dev, pt_prev, orig_dev);
} else {
drop:
- atomic_long_inc(&skb->dev->rx_dropped);
+ if (!deliver_exact)
+ atomic_long_inc(&skb->dev->rx_dropped);
+ else
+ atomic_long_inc(&skb->dev->rx_nohandler);
kfree_skb(skb);
/* Jamal, now you will not able to escape explaining
* me how you were going to use this. :-)
@@ -4137,6 +4360,7 @@ static void gro_list_prepare(struct napi_struct *napi, struct sk_buff *skb)
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= p->vlan_tci ^ skb->vlan_tci;
+ diffs |= skb_metadata_dst_cmp(p, skb);
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
skb_mac_header(skb));
@@ -4214,7 +4438,8 @@ static enum gro_result dev_gro_receive(struct napi_struct *napi, struct sk_buff
NAPI_GRO_CB(skb)->same_flow = 0;
NAPI_GRO_CB(skb)->flush = 0;
NAPI_GRO_CB(skb)->free = 0;
- NAPI_GRO_CB(skb)->udp_mark = 0;
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+ NAPI_GRO_CB(skb)->is_fou = 0;
NAPI_GRO_CB(skb)->gro_remcsum_start = 0;
/* Setup for GRO checksum validation */
@@ -4334,10 +4559,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
break;
case GRO_MERGED_FREE:
- if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+ if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
+ skb_dst_drop(skb);
kmem_cache_free(skbuff_head_cache, skb);
- else
+ } else {
__kfree_skb(skb);
+ }
break;
case GRO_HELD:
@@ -4350,6 +4577,7 @@ static gro_result_t napi_skb_finish(gro_result_t ret, struct sk_buff *skb)
gro_result_t napi_gro_receive(struct napi_struct *napi, struct sk_buff *skb)
{
+ skb_mark_napi_id(skb, napi);
trace_napi_gro_receive_entry(skb);
skb_gro_reset_offset(skb);
@@ -4383,7 +4611,10 @@ struct sk_buff *napi_get_frags(struct napi_struct *napi)
if (!skb) {
skb = napi_alloc_skb(napi, GRO_MAX_HEAD);
- napi->skb = skb;
+ if (skb) {
+ napi->skb = skb;
+ skb_mark_napi_id(skb, napi);
+ }
}
return skb;
}
@@ -4658,7 +4889,7 @@ void napi_complete_done(struct napi_struct *n, int work_done)
EXPORT_SYMBOL(napi_complete_done);
/* must be called under rcu_read_lock(), as we dont take a reference */
-struct napi_struct *napi_by_id(unsigned int napi_id)
+static struct napi_struct *napi_by_id(unsigned int napi_id)
{
unsigned int hash = napi_id % HASH_SIZE(napi_hash);
struct napi_struct *napi;
@@ -4669,43 +4900,101 @@ struct napi_struct *napi_by_id(unsigned int napi_id)
return NULL;
}
-EXPORT_SYMBOL_GPL(napi_by_id);
-void napi_hash_add(struct napi_struct *napi)
+#if defined(CONFIG_NET_RX_BUSY_POLL)
+#define BUSY_POLL_BUDGET 8
+bool sk_busy_loop(struct sock *sk, int nonblock)
{
- if (!test_and_set_bit(NAPI_STATE_HASHED, &napi->state)) {
+ unsigned long end_time = !nonblock ? sk_busy_loop_end_time(sk) : 0;
+ int (*busy_poll)(struct napi_struct *dev);
+ struct napi_struct *napi;
+ int rc = false;
- spin_lock(&napi_hash_lock);
+ rcu_read_lock();
- /* 0 is not a valid id, we also skip an id that is taken
- * we expect both events to be extremely rare
- */
- napi->napi_id = 0;
- while (!napi->napi_id) {
- napi->napi_id = ++napi_gen_id;
- if (napi_by_id(napi->napi_id))
- napi->napi_id = 0;
+ napi = napi_by_id(sk->sk_napi_id);
+ if (!napi)
+ goto out;
+
+ /* Note: ndo_busy_poll method is optional in linux-4.5 */
+ busy_poll = napi->dev->netdev_ops->ndo_busy_poll;
+
+ do {
+ rc = 0;
+ local_bh_disable();
+ if (busy_poll) {
+ rc = busy_poll(napi);
+ } else if (napi_schedule_prep(napi)) {
+ void *have = netpoll_poll_lock(napi);
+
+ if (test_bit(NAPI_STATE_SCHED, &napi->state)) {
+ rc = napi->poll(napi, BUSY_POLL_BUDGET);
+ trace_napi_poll(napi);
+ if (rc == BUSY_POLL_BUDGET) {
+ napi_complete_done(napi, rc);
+ napi_schedule(napi);
+ }
+ }
+ netpoll_poll_unlock(have);
}
+ if (rc > 0)
+ NET_ADD_STATS_BH(sock_net(sk),
+ LINUX_MIB_BUSYPOLLRXPACKETS, rc);
+ local_bh_enable();
- hlist_add_head_rcu(&napi->napi_hash_node,
- &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+ if (rc == LL_FLUSH_FAILED)
+ break; /* permanent failure */
- spin_unlock(&napi_hash_lock);
- }
+ cpu_relax();
+ } while (!nonblock && skb_queue_empty(&sk->sk_receive_queue) &&
+ !need_resched() && !busy_loop_timeout(end_time));
+
+ rc = !skb_queue_empty(&sk->sk_receive_queue);
+out:
+ rcu_read_unlock();
+ return rc;
+}
+EXPORT_SYMBOL(sk_busy_loop);
+
+#endif /* CONFIG_NET_RX_BUSY_POLL */
+
+void napi_hash_add(struct napi_struct *napi)
+{
+ if (test_bit(NAPI_STATE_NO_BUSY_POLL, &napi->state) ||
+ test_and_set_bit(NAPI_STATE_HASHED, &napi->state))
+ return;
+
+ spin_lock(&napi_hash_lock);
+
+ /* 0..NR_CPUS+1 range is reserved for sender_cpu use */
+ do {
+ if (unlikely(++napi_gen_id < NR_CPUS + 1))
+ napi_gen_id = NR_CPUS + 1;
+ } while (napi_by_id(napi_gen_id));
+ napi->napi_id = napi_gen_id;
+
+ hlist_add_head_rcu(&napi->napi_hash_node,
+ &napi_hash[napi->napi_id % HASH_SIZE(napi_hash)]);
+
+ spin_unlock(&napi_hash_lock);
}
EXPORT_SYMBOL_GPL(napi_hash_add);
/* Warning : caller is responsible to make sure rcu grace period
* is respected before freeing memory containing @napi
*/
-void napi_hash_del(struct napi_struct *napi)
+bool napi_hash_del(struct napi_struct *napi)
{
+ bool rcu_sync_needed = false;
+
spin_lock(&napi_hash_lock);
- if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state))
+ if (test_and_clear_bit(NAPI_STATE_HASHED, &napi->state)) {
+ rcu_sync_needed = true;
hlist_del_rcu(&napi->napi_hash_node);
-
+ }
spin_unlock(&napi_hash_lock);
+ return rcu_sync_needed;
}
EXPORT_SYMBOL_GPL(napi_hash_del);
@@ -4741,6 +5030,7 @@ void netif_napi_add(struct net_device *dev, struct napi_struct *napi,
napi->poll_owner = -1;
#endif
set_bit(NAPI_STATE_SCHED, &napi->state);
+ napi_hash_add(napi);
}
EXPORT_SYMBOL(netif_napi_add);
@@ -4760,8 +5050,12 @@ void napi_disable(struct napi_struct *n)
}
EXPORT_SYMBOL(napi_disable);
+/* Must be called in process context */
void netif_napi_del(struct napi_struct *napi)
{
+ might_sleep();
+ if (napi_hash_del(napi))
+ synchronize_net();
list_del_init(&napi->dev_list);
napi_free_frags(napi);
@@ -4868,6 +5162,7 @@ static void net_rx_action(struct softirq_action *h)
}
}
+ __kfree_skb_flush();
local_irq_disable();
list_splice_tail_init(&sd->poll_list, &list);
@@ -5095,12 +5390,12 @@ void *netdev_lower_get_next(struct net_device *dev, struct list_head **iter)
{
struct netdev_adjacent *lower;
- lower = list_entry((*iter)->next, struct netdev_adjacent, list);
+ lower = list_entry(*iter, struct netdev_adjacent, list);
if (&lower->list == &dev->adj_list.lower)
return NULL;
- *iter = &lower->list;
+ *iter = lower->list.next;
return lower->dev;
}
@@ -5348,7 +5643,7 @@ static void __netdev_adjacent_dev_unlink_neighbour(struct net_device *dev,
static int __netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev, bool master,
- void *private)
+ void *upper_priv, void *upper_info)
{
struct netdev_notifier_changeupper_info changeupper_info;
struct netdev_adjacent *i, *j, *to_i, *to_j;
@@ -5372,6 +5667,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
changeupper_info.upper_dev = upper_dev;
changeupper_info.master = master;
changeupper_info.linking = true;
+ changeupper_info.upper_info = upper_info;
ret = call_netdevice_notifiers_info(NETDEV_PRECHANGEUPPER, dev,
&changeupper_info.info);
@@ -5379,7 +5675,7 @@ static int __netdev_upper_dev_link(struct net_device *dev,
if (ret)
return ret;
- ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, private,
+ ret = __netdev_adjacent_dev_link_neighbour(dev, upper_dev, upper_priv,
master);
if (ret)
return ret;
@@ -5417,8 +5713,12 @@ static int __netdev_upper_dev_link(struct net_device *dev,
goto rollback_lower_mesh;
}
- call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
- &changeupper_info.info);
+ ret = call_netdevice_notifiers_info(NETDEV_CHANGEUPPER, dev,
+ &changeupper_info.info);
+ ret = notifier_to_errno(ret);
+ if (ret)
+ goto rollback_lower_mesh;
+
return 0;
rollback_lower_mesh:
@@ -5472,7 +5772,7 @@ rollback_mesh:
int netdev_upper_dev_link(struct net_device *dev,
struct net_device *upper_dev)
{
- return __netdev_upper_dev_link(dev, upper_dev, false, NULL);
+ return __netdev_upper_dev_link(dev, upper_dev, false, NULL, NULL);
}
EXPORT_SYMBOL(netdev_upper_dev_link);
@@ -5480,6 +5780,8 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
* netdev_master_upper_dev_link - Add a master link to the upper device
* @dev: device
* @upper_dev: new upper device
+ * @upper_priv: upper device private
+ * @upper_info: upper info to be passed down via notifier
*
* Adds a link to device which is upper to this one. In this case, only
* one master upper device can be linked, although other non-master devices
@@ -5488,20 +5790,14 @@ EXPORT_SYMBOL(netdev_upper_dev_link);
* counts are adjusted and the function returns zero.
*/
int netdev_master_upper_dev_link(struct net_device *dev,
- struct net_device *upper_dev)
+ struct net_device *upper_dev,
+ void *upper_priv, void *upper_info)
{
- return __netdev_upper_dev_link(dev, upper_dev, true, NULL);
+ return __netdev_upper_dev_link(dev, upper_dev, true,
+ upper_priv, upper_info);
}
EXPORT_SYMBOL(netdev_master_upper_dev_link);
-int netdev_master_upper_dev_link_private(struct net_device *dev,
- struct net_device *upper_dev,
- void *private)
-{
- return __netdev_upper_dev_link(dev, upper_dev, true, private);
-}
-EXPORT_SYMBOL(netdev_master_upper_dev_link_private);
-
/**
* netdev_upper_dev_unlink - Removes a link to upper device
* @dev: device
@@ -5660,7 +5956,7 @@ EXPORT_SYMBOL(netdev_lower_dev_get_private);
int dev_get_nest_level(struct net_device *dev,
- bool (*type_check)(struct net_device *dev))
+ bool (*type_check)(const struct net_device *dev))
{
struct net_device *lower = NULL;
struct list_head *iter;
@@ -5682,6 +5978,26 @@ int dev_get_nest_level(struct net_device *dev,
}
EXPORT_SYMBOL(dev_get_nest_level);
+/**
+ * netdev_lower_change - Dispatch event about lower device state change
+ * @lower_dev: device
+ * @lower_state_info: state to dispatch
+ *
+ * Send NETDEV_CHANGELOWERSTATE to netdev notifiers with info.
+ * The caller must hold the RTNL lock.
+ */
+void netdev_lower_state_changed(struct net_device *lower_dev,
+ void *lower_state_info)
+{
+ struct netdev_notifier_changelowerstate_info changelowerstate_info;
+
+ ASSERT_RTNL();
+ changelowerstate_info.lower_state_info = lower_state_info;
+ call_netdevice_notifiers_info(NETDEV_CHANGELOWERSTATE, lower_dev,
+ &changelowerstate_info.info);
+}
+EXPORT_SYMBOL(netdev_lower_state_changed);
+
static void dev_change_rx_flags(struct net_device *dev, int flags)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -6130,6 +6446,7 @@ EXPORT_SYMBOL(dev_get_phys_port_id);
* dev_get_phys_port_name - Get device physical port name
* @dev: device
* @name: port name
+ * @len: limit of bytes to copy to name
*
* Get device physical port name
*/
@@ -6372,9 +6689,9 @@ static netdev_features_t netdev_fix_features(struct net_device *dev,
/* UFO needs SG and checksumming */
if (features & NETIF_F_UFO) {
/* maybe split UFO into V4 and V6? */
- if (!((features & NETIF_F_GEN_CSUM) ||
- (features & (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))
- == (NETIF_F_IP_CSUM|NETIF_F_IPV6_CSUM))) {
+ if (!(features & NETIF_F_HW_CSUM) &&
+ ((features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) !=
+ (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM))) {
netdev_dbg(dev,
"Dropping NETIF_F_UFO since no checksum offload features.\n");
features &= ~NETIF_F_UFO;
@@ -6402,7 +6719,7 @@ int __netdev_update_features(struct net_device *dev)
struct net_device *upper, *lower;
netdev_features_t features;
struct list_head *iter;
- int err = 0;
+ int err = -1;
ASSERT_RTNL();
@@ -6419,21 +6736,27 @@ int __netdev_update_features(struct net_device *dev)
features = netdev_sync_upper_features(dev, upper, features);
if (dev->features == features)
- return 0;
+ goto sync_lower;
netdev_dbg(dev, "Features changed: %pNF -> %pNF\n",
&dev->features, &features);
if (dev->netdev_ops->ndo_set_features)
err = dev->netdev_ops->ndo_set_features(dev, features);
+ else
+ err = 0;
if (unlikely(err < 0)) {
netdev_err(dev,
"set_features() failed (%d); wanted %pNF, left %pNF\n",
err, &features, &dev->features);
+ /* return non-0 since some features might have changed and
+ * it's better to fire a spurious notification than miss it
+ */
return -1;
}
+sync_lower:
/* some features must be disabled on lower devices when disabled
* on an upper device (think: bonding master or bridge)
*/
@@ -6443,7 +6766,7 @@ int __netdev_update_features(struct net_device *dev)
if (!err)
dev->features = features;
- return 1;
+ return err < 0 ? 0 : 1;
}
/**
@@ -6942,24 +7265,31 @@ void netdev_run_todo(void)
}
}
-/* Convert net_device_stats to rtnl_link_stats64. They have the same
- * fields in the same order, with only the type differing.
+/* Convert net_device_stats to rtnl_link_stats64. rtnl_link_stats64 has
+ * all the same fields in the same order as net_device_stats, with only
+ * the type differing, but rtnl_link_stats64 may have additional fields
+ * at the end for newer counters.
*/
void netdev_stats_to_stats64(struct rtnl_link_stats64 *stats64,
const struct net_device_stats *netdev_stats)
{
#if BITS_PER_LONG == 64
- BUILD_BUG_ON(sizeof(*stats64) != sizeof(*netdev_stats));
+ BUILD_BUG_ON(sizeof(*stats64) < sizeof(*netdev_stats));
memcpy(stats64, netdev_stats, sizeof(*stats64));
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + sizeof(*netdev_stats), 0,
+ sizeof(*stats64) - sizeof(*netdev_stats));
#else
- size_t i, n = sizeof(*stats64) / sizeof(u64);
+ size_t i, n = sizeof(*netdev_stats) / sizeof(unsigned long);
const unsigned long *src = (const unsigned long *)netdev_stats;
u64 *dst = (u64 *)stats64;
- BUILD_BUG_ON(sizeof(*netdev_stats) / sizeof(unsigned long) !=
- sizeof(*stats64) / sizeof(u64));
+ BUILD_BUG_ON(n > sizeof(*stats64) / sizeof(u64));
for (i = 0; i < n; i++)
dst[i] = src[i];
+ /* zero out counters that only exist in rtnl_link_stats64 */
+ memset((char *)stats64 + n * sizeof(u64), 0,
+ sizeof(*stats64) - n * sizeof(u64));
#endif
}
EXPORT_SYMBOL(netdev_stats_to_stats64);
@@ -6989,6 +7319,7 @@ struct rtnl_link_stats64 *dev_get_stats(struct net_device *dev,
}
storage->rx_dropped += atomic_long_read(&dev->rx_dropped);
storage->tx_dropped += atomic_long_read(&dev->tx_dropped);
+ storage->rx_nohandler += atomic_long_read(&dev->rx_nohandler);
return storage;
}
EXPORT_SYMBOL(dev_get_stats);
@@ -7111,8 +7442,10 @@ struct net_device *alloc_netdev_mqs(int sizeof_priv, const char *name,
dev->priv_flags = IFF_XMIT_DST_RELEASE | IFF_XMIT_DST_RELEASE_PERM;
setup(dev);
- if (!dev->tx_queue_len)
+ if (!dev->tx_queue_len) {
dev->priv_flags |= IFF_NO_QUEUE;
+ dev->tx_queue_len = 1;
+ }
dev->num_tx_queues = txqs;
dev->real_num_tx_queues = txqs;
@@ -7155,11 +7488,13 @@ EXPORT_SYMBOL(alloc_netdev_mqs);
* This function does the last stage of destroying an allocated device
* interface. The reference to the device object is released.
* If this is the last reference then it will be freed.
+ * Must be called in process context.
*/
void free_netdev(struct net_device *dev)
{
struct napi_struct *p, *n;
+ might_sleep();
netif_free_tx_queues(dev);
#ifdef CONFIG_SYSFS
kvfree(dev->_rx);
@@ -7468,16 +7803,16 @@ static int dev_cpu_callback(struct notifier_block *nfb,
netdev_features_t netdev_increment_features(netdev_features_t all,
netdev_features_t one, netdev_features_t mask)
{
- if (mask & NETIF_F_GEN_CSUM)
- mask |= NETIF_F_ALL_CSUM;
+ if (mask & NETIF_F_HW_CSUM)
+ mask |= NETIF_F_CSUM_MASK;
mask |= NETIF_F_VLAN_CHALLENGED;
- all |= one & (NETIF_F_ONE_FOR_ALL|NETIF_F_ALL_CSUM) & mask;
+ all |= one & (NETIF_F_ONE_FOR_ALL | NETIF_F_CSUM_MASK) & mask;
all &= one | ~NETIF_F_ALL_FOR_ALL;
/* If one device supports hw checksumming, set for all. */
- if (all & NETIF_F_GEN_CSUM)
- all &= ~(NETIF_F_ALL_CSUM & ~NETIF_F_GEN_CSUM);
+ if (all & NETIF_F_HW_CSUM)
+ all &= ~(NETIF_F_CSUM_MASK & ~NETIF_F_HW_CSUM);
return all;
}
diff --git a/net/core/devlink.c b/net/core/devlink.c
new file mode 100644
index 000000000000..590fa561cb7f
--- /dev/null
+++ b/net/core/devlink.c
@@ -0,0 +1,738 @@
+/*
+ * net/core/devlink.c - Network physical/parent device Netlink interface
+ *
+ * Heavily inspired by net/wireless/
+ * Copyright (c) 2016 Mellanox Technologies. All rights reserved.
+ * Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/types.h>
+#include <linux/slab.h>
+#include <linux/gfp.h>
+#include <linux/device.h>
+#include <linux/list.h>
+#include <linux/netdevice.h>
+#include <rdma/ib_verbs.h>
+#include <net/netlink.h>
+#include <net/genetlink.h>
+#include <net/rtnetlink.h>
+#include <net/net_namespace.h>
+#include <net/sock.h>
+#include <net/devlink.h>
+
+static LIST_HEAD(devlink_list);
+
+/* devlink_mutex
+ *
+ * An overall lock guarding every operation coming from userspace.
+ * It also guards devlink devices list and it is taken when
+ * driver registers/unregisters it.
+ */
+static DEFINE_MUTEX(devlink_mutex);
+
+/* devlink_port_mutex
+ *
+ * Shared lock to guard lists of ports in all devlink devices.
+ */
+static DEFINE_MUTEX(devlink_port_mutex);
+
+static struct net *devlink_net(const struct devlink *devlink)
+{
+ return read_pnet(&devlink->_net);
+}
+
+static void devlink_net_set(struct devlink *devlink, struct net *net)
+{
+ write_pnet(&devlink->_net, net);
+}
+
+static struct devlink *devlink_get_from_attrs(struct net *net,
+ struct nlattr **attrs)
+{
+ struct devlink *devlink;
+ char *busname;
+ char *devname;
+
+ if (!attrs[DEVLINK_ATTR_BUS_NAME] || !attrs[DEVLINK_ATTR_DEV_NAME])
+ return ERR_PTR(-EINVAL);
+
+ busname = nla_data(attrs[DEVLINK_ATTR_BUS_NAME]);
+ devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
+
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (strcmp(devlink->dev->bus->name, busname) == 0 &&
+ strcmp(dev_name(devlink->dev), devname) == 0 &&
+ net_eq(devlink_net(devlink), net))
+ return devlink;
+ }
+
+ return ERR_PTR(-ENODEV);
+}
+
+static struct devlink *devlink_get_from_info(struct genl_info *info)
+{
+ return devlink_get_from_attrs(genl_info_net(info), info->attrs);
+}
+
+static struct devlink_port *devlink_port_get_by_index(struct devlink *devlink,
+ int port_index)
+{
+ struct devlink_port *devlink_port;
+
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ if (devlink_port->index == port_index)
+ return devlink_port;
+ }
+ return NULL;
+}
+
+static bool devlink_port_index_exists(struct devlink *devlink, int port_index)
+{
+ return devlink_port_get_by_index(devlink, port_index);
+}
+
+static struct devlink_port *devlink_port_get_from_attrs(struct devlink *devlink,
+ struct nlattr **attrs)
+{
+ if (attrs[DEVLINK_ATTR_PORT_INDEX]) {
+ u32 port_index = nla_get_u32(attrs[DEVLINK_ATTR_PORT_INDEX]);
+ struct devlink_port *devlink_port;
+
+ devlink_port = devlink_port_get_by_index(devlink, port_index);
+ if (!devlink_port)
+ return ERR_PTR(-ENODEV);
+ return devlink_port;
+ }
+ return ERR_PTR(-EINVAL);
+}
+
+static struct devlink_port *devlink_port_get_from_info(struct devlink *devlink,
+ struct genl_info *info)
+{
+ return devlink_port_get_from_attrs(devlink, info->attrs);
+}
+
+#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
+
+static int devlink_nl_pre_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink;
+
+ mutex_lock(&devlink_mutex);
+ devlink = devlink_get_from_info(info);
+ if (IS_ERR(devlink)) {
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink);
+ }
+ info->user_ptr[0] = devlink;
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT) {
+ struct devlink_port *devlink_port;
+
+ mutex_lock(&devlink_port_mutex);
+ devlink_port = devlink_port_get_from_info(devlink, info);
+ if (IS_ERR(devlink_port)) {
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+ return PTR_ERR(devlink_port);
+ }
+ info->user_ptr[1] = devlink_port;
+ }
+ return 0;
+}
+
+static void devlink_nl_post_doit(const struct genl_ops *ops,
+ struct sk_buff *skb, struct genl_info *info)
+{
+ if (ops->internal_flags & DEVLINK_NL_FLAG_NEED_PORT)
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+}
+
+static struct genl_family devlink_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .name = DEVLINK_GENL_NAME,
+ .version = DEVLINK_GENL_VERSION,
+ .maxattr = DEVLINK_ATTR_MAX,
+ .netnsok = true,
+ .pre_doit = devlink_nl_pre_doit,
+ .post_doit = devlink_nl_post_doit,
+};
+
+enum devlink_multicast_groups {
+ DEVLINK_MCGRP_CONFIG,
+};
+
+static const struct genl_multicast_group devlink_nl_mcgrps[] = {
+ [DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
+};
+
+static int devlink_nl_put_handle(struct sk_buff *msg, struct devlink *devlink)
+{
+ if (nla_put_string(msg, DEVLINK_ATTR_BUS_NAME, devlink->dev->bus->name))
+ return -EMSGSIZE;
+ if (nla_put_string(msg, DEVLINK_ATTR_DEV_NAME, dev_name(devlink->dev)))
+ return -EMSGSIZE;
+ return 0;
+}
+
+static int devlink_nl_fill(struct sk_buff *msg, struct devlink *devlink,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_notify(struct devlink *devlink, enum devlink_command cmd)
+{
+ struct sk_buff *msg;
+ int err;
+
+ WARN_ON(cmd != DEVLINK_CMD_NEW && cmd != DEVLINK_CMD_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_fill(msg, devlink, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_port_fill(struct sk_buff *msg, struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ enum devlink_command cmd, u32 portid,
+ u32 seq, int flags)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(msg, portid, seq, &devlink_nl_family, flags, cmd);
+ if (!hdr)
+ return -EMSGSIZE;
+
+ if (devlink_nl_put_handle(msg, devlink))
+ goto nla_put_failure;
+ if (nla_put_u32(msg, DEVLINK_ATTR_PORT_INDEX, devlink_port->index))
+ goto nla_put_failure;
+ if (nla_put_u16(msg, DEVLINK_ATTR_PORT_TYPE, devlink_port->type))
+ goto nla_put_failure;
+ if (devlink_port->desired_type != DEVLINK_PORT_TYPE_NOTSET &&
+ nla_put_u16(msg, DEVLINK_ATTR_PORT_DESIRED_TYPE,
+ devlink_port->desired_type))
+ goto nla_put_failure;
+ if (devlink_port->type == DEVLINK_PORT_TYPE_ETH) {
+ struct net_device *netdev = devlink_port->type_dev;
+
+ if (netdev &&
+ (nla_put_u32(msg, DEVLINK_ATTR_PORT_NETDEV_IFINDEX,
+ netdev->ifindex) ||
+ nla_put_string(msg, DEVLINK_ATTR_PORT_NETDEV_NAME,
+ netdev->name)))
+ goto nla_put_failure;
+ }
+ if (devlink_port->type == DEVLINK_PORT_TYPE_IB) {
+ struct ib_device *ibdev = devlink_port->type_dev;
+
+ if (ibdev &&
+ nla_put_string(msg, DEVLINK_ATTR_PORT_IBDEV_NAME,
+ ibdev->name))
+ goto nla_put_failure;
+ }
+ if (devlink_port->split &&
+ nla_put_u32(msg, DEVLINK_ATTR_PORT_SPLIT_GROUP,
+ devlink_port->split_group))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+ return -EMSGSIZE;
+}
+
+static void devlink_port_notify(struct devlink_port *devlink_port,
+ enum devlink_command cmd)
+{
+ struct devlink *devlink = devlink_port->devlink;
+ struct sk_buff *msg;
+ int err;
+
+ if (!devlink_port->registered)
+ return;
+
+ WARN_ON(cmd != DEVLINK_CMD_PORT_NEW && cmd != DEVLINK_CMD_PORT_DEL);
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return;
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port, cmd, 0, 0, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return;
+ }
+
+ genlmsg_multicast_netns(&devlink_nl_family, devlink_net(devlink),
+ msg, 0, DEVLINK_MCGRP_CONFIG, GFP_KERNEL);
+}
+
+static int devlink_nl_cmd_get_doit(struct sk_buff *skb, struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_fill(msg, devlink, DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+out:
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_nl_cmd_port_get_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ struct sk_buff *msg;
+ int err;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_PORT_NEW,
+ info->snd_portid, info->snd_seq, 0);
+ if (err) {
+ nlmsg_free(msg);
+ return err;
+ }
+
+ return genlmsg_reply(msg, info);
+}
+
+static int devlink_nl_cmd_port_get_dumpit(struct sk_buff *msg,
+ struct netlink_callback *cb)
+{
+ struct devlink *devlink;
+ struct devlink_port *devlink_port;
+ int start = cb->args[0];
+ int idx = 0;
+ int err;
+
+ mutex_lock(&devlink_mutex);
+ mutex_lock(&devlink_port_mutex);
+ list_for_each_entry(devlink, &devlink_list, list) {
+ if (!net_eq(devlink_net(devlink), sock_net(msg->sk)))
+ continue;
+ list_for_each_entry(devlink_port, &devlink->port_list, list) {
+ if (idx < start) {
+ idx++;
+ continue;
+ }
+ err = devlink_nl_port_fill(msg, devlink, devlink_port,
+ DEVLINK_CMD_NEW,
+ NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ NLM_F_MULTI);
+ if (err)
+ goto out;
+ idx++;
+ }
+ }
+out:
+ mutex_unlock(&devlink_port_mutex);
+ mutex_unlock(&devlink_mutex);
+
+ cb->args[0] = idx;
+ return msg->len;
+}
+
+static int devlink_port_type_set(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ enum devlink_port_type port_type)
+
+{
+ int err;
+
+ if (devlink->ops && devlink->ops->port_type_set) {
+ if (port_type == DEVLINK_PORT_TYPE_NOTSET)
+ return -EINVAL;
+ err = devlink->ops->port_type_set(devlink_port, port_type);
+ if (err)
+ return err;
+ devlink_port->desired_type = port_type;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+ }
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_set_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ struct devlink_port *devlink_port = info->user_ptr[1];
+ int err;
+
+ if (info->attrs[DEVLINK_ATTR_PORT_TYPE]) {
+ enum devlink_port_type port_type;
+
+ port_type = nla_get_u16(info->attrs[DEVLINK_ATTR_PORT_TYPE]);
+ err = devlink_port_type_set(devlink, devlink_port, port_type);
+ if (err)
+ return err;
+ }
+ return 0;
+}
+
+static int devlink_port_split(struct devlink *devlink,
+ u32 port_index, u32 count)
+
+{
+ if (devlink->ops && devlink->ops->port_split)
+ return devlink->ops->port_split(devlink, port_index, count);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_split_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ u32 port_index;
+ u32 count;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX] ||
+ !info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT])
+ return -EINVAL;
+
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ count = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_SPLIT_COUNT]);
+ return devlink_port_split(devlink, port_index, count);
+}
+
+static int devlink_port_unsplit(struct devlink *devlink, u32 port_index)
+
+{
+ if (devlink->ops && devlink->ops->port_unsplit)
+ return devlink->ops->port_unsplit(devlink, port_index);
+ return -EOPNOTSUPP;
+}
+
+static int devlink_nl_cmd_port_unsplit_doit(struct sk_buff *skb,
+ struct genl_info *info)
+{
+ struct devlink *devlink = info->user_ptr[0];
+ u32 port_index;
+
+ if (!info->attrs[DEVLINK_ATTR_PORT_INDEX])
+ return -EINVAL;
+
+ port_index = nla_get_u32(info->attrs[DEVLINK_ATTR_PORT_INDEX]);
+ return devlink_port_unsplit(devlink, port_index);
+}
+
+static const struct nla_policy devlink_nl_policy[DEVLINK_ATTR_MAX + 1] = {
+ [DEVLINK_ATTR_BUS_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_DEV_NAME] = { .type = NLA_NUL_STRING },
+ [DEVLINK_ATTR_PORT_INDEX] = { .type = NLA_U32 },
+ [DEVLINK_ATTR_PORT_TYPE] = { .type = NLA_U16 },
+ [DEVLINK_ATTR_PORT_SPLIT_COUNT] = { .type = NLA_U32 },
+};
+
+static const struct genl_ops devlink_nl_ops[] = {
+ {
+ .cmd = DEVLINK_CMD_GET,
+ .doit = devlink_nl_cmd_get_doit,
+ .dumpit = devlink_nl_cmd_get_dumpit,
+ .policy = devlink_nl_policy,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_GET,
+ .doit = devlink_nl_cmd_port_get_doit,
+ .dumpit = devlink_nl_cmd_port_get_dumpit,
+ .policy = devlink_nl_policy,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ /* can be retrieved by unprivileged users */
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SET,
+ .doit = devlink_nl_cmd_port_set_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = DEVLINK_NL_FLAG_NEED_PORT,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_SPLIT,
+ .doit = devlink_nl_cmd_port_split_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = DEVLINK_CMD_PORT_UNSPLIT,
+ .doit = devlink_nl_cmd_port_unsplit_doit,
+ .policy = devlink_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+};
+
+/**
+ * devlink_alloc - Allocate new devlink instance resources
+ *
+ * @ops: ops
+ * @priv_size: size of user private data
+ *
+ * Allocate new devlink instance resources, including devlink index
+ * and name.
+ */
+struct devlink *devlink_alloc(const struct devlink_ops *ops, size_t priv_size)
+{
+ struct devlink *devlink;
+
+ devlink = kzalloc(sizeof(*devlink) + priv_size, GFP_KERNEL);
+ if (!devlink)
+ return NULL;
+ devlink->ops = ops;
+ devlink_net_set(devlink, &init_net);
+ INIT_LIST_HEAD(&devlink->port_list);
+ return devlink;
+}
+EXPORT_SYMBOL_GPL(devlink_alloc);
+
+/**
+ * devlink_register - Register devlink instance
+ *
+ * @devlink: devlink
+ */
+int devlink_register(struct devlink *devlink, struct device *dev)
+{
+ mutex_lock(&devlink_mutex);
+ devlink->dev = dev;
+ list_add_tail(&devlink->list, &devlink_list);
+ devlink_notify(devlink, DEVLINK_CMD_NEW);
+ mutex_unlock(&devlink_mutex);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_register);
+
+/**
+ * devlink_unregister - Unregister devlink instance
+ *
+ * @devlink: devlink
+ */
+void devlink_unregister(struct devlink *devlink)
+{
+ mutex_lock(&devlink_mutex);
+ devlink_notify(devlink, DEVLINK_CMD_DEL);
+ list_del(&devlink->list);
+ mutex_unlock(&devlink_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_unregister);
+
+/**
+ * devlink_free - Free devlink instance resources
+ *
+ * @devlink: devlink
+ */
+void devlink_free(struct devlink *devlink)
+{
+ kfree(devlink);
+}
+EXPORT_SYMBOL_GPL(devlink_free);
+
+/**
+ * devlink_port_register - Register devlink port
+ *
+ * @devlink: devlink
+ * @devlink_port: devlink port
+ * @port_index
+ *
+ * Register devlink port with provided port index. User can use
+ * any indexing, even hw-related one. devlink_port structure
+ * is convenient to be embedded inside user driver private structure.
+ * Note that the caller should take care of zeroing the devlink_port
+ * structure.
+ */
+int devlink_port_register(struct devlink *devlink,
+ struct devlink_port *devlink_port,
+ unsigned int port_index)
+{
+ mutex_lock(&devlink_port_mutex);
+ if (devlink_port_index_exists(devlink, port_index)) {
+ mutex_unlock(&devlink_port_mutex);
+ return -EEXIST;
+ }
+ devlink_port->devlink = devlink;
+ devlink_port->index = port_index;
+ devlink_port->type = DEVLINK_PORT_TYPE_NOTSET;
+ devlink_port->registered = true;
+ list_add_tail(&devlink_port->list, &devlink->port_list);
+ mutex_unlock(&devlink_port_mutex);
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(devlink_port_register);
+
+/**
+ * devlink_port_unregister - Unregister devlink port
+ *
+ * @devlink_port: devlink port
+ */
+void devlink_port_unregister(struct devlink_port *devlink_port)
+{
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_DEL);
+ mutex_lock(&devlink_port_mutex);
+ list_del(&devlink_port->list);
+ mutex_unlock(&devlink_port_mutex);
+}
+EXPORT_SYMBOL_GPL(devlink_port_unregister);
+
+static void __devlink_port_type_set(struct devlink_port *devlink_port,
+ enum devlink_port_type type,
+ void *type_dev)
+{
+ devlink_port->type = type;
+ devlink_port->type_dev = type_dev;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+
+/**
+ * devlink_port_type_eth_set - Set port type to Ethernet
+ *
+ * @devlink_port: devlink port
+ * @netdev: related netdevice
+ */
+void devlink_port_type_eth_set(struct devlink_port *devlink_port,
+ struct net_device *netdev)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_ETH, netdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_eth_set);
+
+/**
+ * devlink_port_type_ib_set - Set port type to InfiniBand
+ *
+ * @devlink_port: devlink port
+ * @ibdev: related IB device
+ */
+void devlink_port_type_ib_set(struct devlink_port *devlink_port,
+ struct ib_device *ibdev)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_IB, ibdev);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_ib_set);
+
+/**
+ * devlink_port_type_clear - Clear port type
+ *
+ * @devlink_port: devlink port
+ */
+void devlink_port_type_clear(struct devlink_port *devlink_port)
+{
+ return __devlink_port_type_set(devlink_port,
+ DEVLINK_PORT_TYPE_NOTSET, NULL);
+}
+EXPORT_SYMBOL_GPL(devlink_port_type_clear);
+
+/**
+ * devlink_port_split_set - Set port is split
+ *
+ * @devlink_port: devlink port
+ * @split_group: split group - identifies group split port is part of
+ */
+void devlink_port_split_set(struct devlink_port *devlink_port,
+ u32 split_group)
+{
+ devlink_port->split = true;
+ devlink_port->split_group = split_group;
+ devlink_port_notify(devlink_port, DEVLINK_CMD_PORT_NEW);
+}
+EXPORT_SYMBOL_GPL(devlink_port_split_set);
+
+static int __init devlink_module_init(void)
+{
+ return genl_register_family_with_ops_groups(&devlink_nl_family,
+ devlink_nl_ops,
+ devlink_nl_mcgrps);
+}
+
+static void __exit devlink_module_exit(void)
+{
+ genl_unregister_family(&devlink_nl_family);
+}
+
+module_init(devlink_module_init);
+module_exit(devlink_module_exit);
+
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR("Jiri Pirko <jiri@mellanox.com>");
+MODULE_DESCRIPTION("Network physical device Netlink interface");
+MODULE_ALIAS_GENL_FAMILY(DEVLINK_GENL_NAME);
diff --git a/net/core/dst.c b/net/core/dst.c
index 2a1818065e12..b5cbbe07f786 100644
--- a/net/core/dst.c
+++ b/net/core/dst.c
@@ -265,7 +265,7 @@ again:
lwtstate_put(dst->lwtstate);
if (dst->flags & DST_METADATA)
- kfree(dst);
+ metadata_dst_free((struct metadata_dst *)dst);
else
kmem_cache_free(dst->ops->kmem_cachep, dst);
@@ -301,12 +301,13 @@ void dst_release(struct dst_entry *dst)
{
if (dst) {
int newrefcnt;
+ unsigned short nocache = dst->flags & DST_NOCACHE;
newrefcnt = atomic_dec_return(&dst->__refcnt);
if (unlikely(newrefcnt < 0))
net_warn_ratelimited("%s: dst:%p refcnt:%d\n",
__func__, dst, newrefcnt);
- if (unlikely(dst->flags & DST_NOCACHE) && !newrefcnt)
+ if (!newrefcnt && unlikely(nocache))
call_rcu(&dst->rcu_head, dst_destroy_rcu);
}
}
@@ -394,6 +395,14 @@ struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags)
}
EXPORT_SYMBOL_GPL(metadata_dst_alloc);
+void metadata_dst_free(struct metadata_dst *md_dst)
+{
+#ifdef CONFIG_DST_CACHE
+ dst_cache_destroy(&md_dst->u.tun_info.dst_cache);
+#endif
+ kfree(md_dst);
+}
+
struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t flags)
{
int cpu;
diff --git a/net/core/dst_cache.c b/net/core/dst_cache.c
new file mode 100644
index 000000000000..554d36449231
--- /dev/null
+++ b/net/core/dst_cache.c
@@ -0,0 +1,168 @@
+/*
+ * net/core/dst_cache.c - dst entry cache
+ *
+ * Copyright (c) 2016 Paolo Abeni <pabeni@redhat.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include <linux/kernel.h>
+#include <linux/percpu.h>
+#include <net/dst_cache.h>
+#include <net/route.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <net/ip6_fib.h>
+#endif
+#include <uapi/linux/in.h>
+
+struct dst_cache_pcpu {
+ unsigned long refresh_ts;
+ struct dst_entry *dst;
+ u32 cookie;
+ union {
+ struct in_addr in_saddr;
+ struct in6_addr in6_saddr;
+ };
+};
+
+static void dst_cache_per_cpu_dst_set(struct dst_cache_pcpu *dst_cache,
+ struct dst_entry *dst, u32 cookie)
+{
+ dst_release(dst_cache->dst);
+ if (dst)
+ dst_hold(dst);
+
+ dst_cache->cookie = cookie;
+ dst_cache->dst = dst;
+}
+
+static struct dst_entry *dst_cache_per_cpu_get(struct dst_cache *dst_cache,
+ struct dst_cache_pcpu *idst)
+{
+ struct dst_entry *dst;
+
+ dst = idst->dst;
+ if (!dst)
+ goto fail;
+
+ /* the cache already hold a dst reference; it can't go away */
+ dst_hold(dst);
+
+ if (unlikely(!time_after(idst->refresh_ts, dst_cache->reset_ts) ||
+ (dst->obsolete && !dst->ops->check(dst, idst->cookie)))) {
+ dst_cache_per_cpu_dst_set(idst, NULL, 0);
+ dst_release(dst);
+ goto fail;
+ }
+ return dst;
+
+fail:
+ idst->refresh_ts = jiffies;
+ return NULL;
+}
+
+struct dst_entry *dst_cache_get(struct dst_cache *dst_cache)
+{
+ if (!dst_cache->cache)
+ return NULL;
+
+ return dst_cache_per_cpu_get(dst_cache, this_cpu_ptr(dst_cache->cache));
+}
+EXPORT_SYMBOL_GPL(dst_cache_get);
+
+struct rtable *dst_cache_get_ip4(struct dst_cache *dst_cache, __be32 *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst)
+ return NULL;
+
+ *saddr = idst->in_saddr.s_addr;
+ return container_of(dst, struct rtable, dst);
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip4);
+
+void dst_cache_set_ip4(struct dst_cache *dst_cache, struct dst_entry *dst,
+ __be32 saddr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst_cache_per_cpu_dst_set(idst, dst, 0);
+ idst->in_saddr.s_addr = saddr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip4);
+
+#if IS_ENABLED(CONFIG_IPV6)
+void dst_cache_set_ip6(struct dst_cache *dst_cache, struct dst_entry *dst,
+ const struct in6_addr *addr)
+{
+ struct dst_cache_pcpu *idst;
+
+ if (!dst_cache->cache)
+ return;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst_cache_per_cpu_dst_set(this_cpu_ptr(dst_cache->cache), dst,
+ rt6_get_cookie((struct rt6_info *)dst));
+ idst->in6_saddr = *addr;
+}
+EXPORT_SYMBOL_GPL(dst_cache_set_ip6);
+
+struct dst_entry *dst_cache_get_ip6(struct dst_cache *dst_cache,
+ struct in6_addr *saddr)
+{
+ struct dst_cache_pcpu *idst;
+ struct dst_entry *dst;
+
+ if (!dst_cache->cache)
+ return NULL;
+
+ idst = this_cpu_ptr(dst_cache->cache);
+ dst = dst_cache_per_cpu_get(dst_cache, idst);
+ if (!dst)
+ return NULL;
+
+ *saddr = idst->in6_saddr;
+ return dst;
+}
+EXPORT_SYMBOL_GPL(dst_cache_get_ip6);
+#endif
+
+int dst_cache_init(struct dst_cache *dst_cache, gfp_t gfp)
+{
+ dst_cache->cache = alloc_percpu_gfp(struct dst_cache_pcpu,
+ gfp | __GFP_ZERO);
+ if (!dst_cache->cache)
+ return -ENOMEM;
+
+ dst_cache_reset(dst_cache);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(dst_cache_init);
+
+void dst_cache_destroy(struct dst_cache *dst_cache)
+{
+ int i;
+
+ if (!dst_cache->cache)
+ return;
+
+ for_each_possible_cpu(i)
+ dst_release(per_cpu_ptr(dst_cache->cache, i)->dst);
+
+ free_percpu(dst_cache->cache);
+}
+EXPORT_SYMBOL_GPL(dst_cache_destroy);
diff --git a/net/core/ethtool.c b/net/core/ethtool.c
index 29edf74846fc..f426c5ad6149 100644
--- a/net/core/ethtool.c
+++ b/net/core/ethtool.c
@@ -87,7 +87,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_GSO_UDP_TUNNEL_BIT] = "tx-udp_tnl-segmentation",
[NETIF_F_FCOE_CRC_BIT] = "tx-checksum-fcoe-crc",
- [NETIF_F_SCTP_CSUM_BIT] = "tx-checksum-sctp",
+ [NETIF_F_SCTP_CRC_BIT] = "tx-checksum-sctp",
[NETIF_F_FCOE_MTU_BIT] = "fcoe-mtu",
[NETIF_F_NTUPLE_BIT] = "rx-ntuple-filter",
[NETIF_F_RXHASH_BIT] = "rx-hashing",
@@ -98,6 +98,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
[NETIF_F_BUSY_POLL_BIT] = "busy-poll",
+ [NETIF_F_HW_TC_BIT] = "hw-tc-offload",
};
static const char
@@ -191,6 +192,23 @@ static int ethtool_set_features(struct net_device *dev, void __user *useraddr)
return ret;
}
+static int phy_get_sset_count(struct phy_device *phydev)
+{
+ int ret;
+
+ if (phydev->drv->get_sset_count &&
+ phydev->drv->get_strings &&
+ phydev->drv->get_stats) {
+ mutex_lock(&phydev->lock);
+ ret = phydev->drv->get_sset_count(phydev);
+ mutex_unlock(&phydev->lock);
+
+ return ret;
+ }
+
+ return -EOPNOTSUPP;
+}
+
static int __ethtool_get_sset_count(struct net_device *dev, int sset)
{
const struct ethtool_ops *ops = dev->ethtool_ops;
@@ -204,6 +222,13 @@ static int __ethtool_get_sset_count(struct net_device *dev, int sset)
if (sset == ETH_SS_TUNABLES)
return ARRAY_SIZE(tunable_strings);
+ if (sset == ETH_SS_PHY_STATS) {
+ if (dev->phydev)
+ return phy_get_sset_count(dev->phydev);
+ else
+ return -EOPNOTSUPP;
+ }
+
if (ops->get_sset_count && ops->get_strings)
return ops->get_sset_count(dev, sset);
else
@@ -223,7 +248,17 @@ static void __ethtool_get_strings(struct net_device *dev,
sizeof(rss_hash_func_strings));
else if (stringset == ETH_SS_TUNABLES)
memcpy(data, tunable_strings, sizeof(tunable_strings));
- else
+ else if (stringset == ETH_SS_PHY_STATS) {
+ struct phy_device *phydev = dev->phydev;
+
+ if (phydev) {
+ mutex_lock(&phydev->lock);
+ phydev->drv->get_strings(phydev, data);
+ mutex_unlock(&phydev->lock);
+ } else {
+ return;
+ }
+ } else
/* ops->get_strings is valid because checked earlier */
ops->get_strings(dev, stringset, data);
}
@@ -235,7 +270,7 @@ static netdev_features_t ethtool_get_feature_mask(u32 eth_cmd)
switch (eth_cmd) {
case ETHTOOL_GTXCSUM:
case ETHTOOL_STXCSUM:
- return NETIF_F_ALL_CSUM | NETIF_F_SCTP_CSUM;
+ return NETIF_F_CSUM_MASK | NETIF_F_SCTP_CRC;
case ETHTOOL_GRXCSUM:
case ETHTOOL_SRXCSUM:
return NETIF_F_RXCSUM;
@@ -352,43 +387,461 @@ static int __ethtool_set_flags(struct net_device *dev, u32 data)
return 0;
}
-int __ethtool_get_settings(struct net_device *dev, struct ethtool_cmd *cmd)
+static void convert_legacy_u32_to_link_mode(unsigned long *dst, u32 legacy_u32)
+{
+ bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ dst[0] = legacy_u32;
+}
+
+/* return false if src had higher bits set. lower bits always updated. */
+static bool convert_link_mode_to_legacy_u32(u32 *legacy_u32,
+ const unsigned long *src)
+{
+ bool retval = true;
+
+ /* TODO: following test will soon always be true */
+ if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) {
+ __ETHTOOL_DECLARE_LINK_MODE_MASK(ext);
+
+ bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_fill(ext, 32);
+ bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS);
+ if (bitmap_intersects(ext, src,
+ __ETHTOOL_LINK_MODE_MASK_NBITS)) {
+ /* src mask goes beyond bit 31 */
+ retval = false;
+ }
+ }
+ *legacy_u32 = src[0];
+ return retval;
+}
+
+/* return false if legacy contained non-0 deprecated fields
+ * transceiver/maxtxpkt/maxrxpkt. rest of ksettings always updated
+ */
+static bool
+convert_legacy_settings_to_link_ksettings(
+ struct ethtool_link_ksettings *link_ksettings,
+ const struct ethtool_cmd *legacy_settings)
+{
+ bool retval = true;
+
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+
+ /* This is used to tell users that driver is still using these
+ * deprecated legacy fields, and they should not use
+ * %ETHTOOL_GLINKSETTINGS/%ETHTOOL_SLINKSETTINGS
+ */
+ if (legacy_settings->transceiver ||
+ legacy_settings->maxtxpkt ||
+ legacy_settings->maxrxpkt)
+ retval = false;
+
+ convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.supported,
+ legacy_settings->supported);
+ convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.advertising,
+ legacy_settings->advertising);
+ convert_legacy_u32_to_link_mode(
+ link_ksettings->link_modes.lp_advertising,
+ legacy_settings->lp_advertising);
+ link_ksettings->base.speed
+ = ethtool_cmd_speed(legacy_settings);
+ link_ksettings->base.duplex
+ = legacy_settings->duplex;
+ link_ksettings->base.port
+ = legacy_settings->port;
+ link_ksettings->base.phy_address
+ = legacy_settings->phy_address;
+ link_ksettings->base.autoneg
+ = legacy_settings->autoneg;
+ link_ksettings->base.mdio_support
+ = legacy_settings->mdio_support;
+ link_ksettings->base.eth_tp_mdix
+ = legacy_settings->eth_tp_mdix;
+ link_ksettings->base.eth_tp_mdix_ctrl
+ = legacy_settings->eth_tp_mdix_ctrl;
+ return retval;
+}
+
+/* return false if ksettings link modes had higher bits
+ * set. legacy_settings always updated (best effort)
+ */
+static bool
+convert_link_ksettings_to_legacy_settings(
+ struct ethtool_cmd *legacy_settings,
+ const struct ethtool_link_ksettings *link_ksettings)
+{
+ bool retval = true;
+
+ memset(legacy_settings, 0, sizeof(*legacy_settings));
+ /* this also clears the deprecated fields in legacy structure:
+ * __u8 transceiver;
+ * __u32 maxtxpkt;
+ * __u32 maxrxpkt;
+ */
+
+ retval &= convert_link_mode_to_legacy_u32(
+ &legacy_settings->supported,
+ link_ksettings->link_modes.supported);
+ retval &= convert_link_mode_to_legacy_u32(
+ &legacy_settings->advertising,
+ link_ksettings->link_modes.advertising);
+ retval &= convert_link_mode_to_legacy_u32(
+ &legacy_settings->lp_advertising,
+ link_ksettings->link_modes.lp_advertising);
+ ethtool_cmd_speed_set(legacy_settings, link_ksettings->base.speed);
+ legacy_settings->duplex
+ = link_ksettings->base.duplex;
+ legacy_settings->port
+ = link_ksettings->base.port;
+ legacy_settings->phy_address
+ = link_ksettings->base.phy_address;
+ legacy_settings->autoneg
+ = link_ksettings->base.autoneg;
+ legacy_settings->mdio_support
+ = link_ksettings->base.mdio_support;
+ legacy_settings->eth_tp_mdix
+ = link_ksettings->base.eth_tp_mdix;
+ legacy_settings->eth_tp_mdix_ctrl
+ = link_ksettings->base.eth_tp_mdix_ctrl;
+ return retval;
+}
+
+/* number of 32-bit words to store the user's link mode bitmaps */
+#define __ETHTOOL_LINK_MODE_MASK_NU32 \
+ DIV_ROUND_UP(__ETHTOOL_LINK_MODE_MASK_NBITS, 32)
+
+/* layout of the struct passed from/to userland */
+struct ethtool_link_usettings {
+ struct ethtool_link_settings base;
+ struct {
+ __u32 supported[__ETHTOOL_LINK_MODE_MASK_NU32];
+ __u32 advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+ __u32 lp_advertising[__ETHTOOL_LINK_MODE_MASK_NU32];
+ } link_modes;
+};
+
+/* Internal kernel helper to query a device ethtool_link_settings.
+ *
+ * Backward compatibility note: for compatibility with legacy drivers
+ * that implement only the ethtool_cmd API, this has to work with both
+ * drivers implementing get_link_ksettings API and drivers
+ * implementing get_settings API. When drivers implement get_settings
+ * and report ethtool_cmd deprecated fields
+ * (transceiver/maxrxpkt/maxtxpkt), these fields are silently ignored
+ * because the resulting struct ethtool_link_settings does not report them.
+ */
+int __ethtool_get_link_ksettings(struct net_device *dev,
+ struct ethtool_link_ksettings *link_ksettings)
{
+ int err;
+ struct ethtool_cmd cmd;
+
ASSERT_RTNL();
+ if (dev->ethtool_ops->get_link_ksettings) {
+ memset(link_ksettings, 0, sizeof(*link_ksettings));
+ return dev->ethtool_ops->get_link_ksettings(dev,
+ link_ksettings);
+ }
+
+ /* driver doesn't support %ethtool_link_ksettings API. revert to
+ * legacy %ethtool_cmd API, unless it's not supported either.
+ * TODO: remove when ethtool_ops::get_settings disappears internally
+ */
if (!dev->ethtool_ops->get_settings)
return -EOPNOTSUPP;
- memset(cmd, 0, sizeof(struct ethtool_cmd));
- cmd->cmd = ETHTOOL_GSET;
- return dev->ethtool_ops->get_settings(dev, cmd);
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.cmd = ETHTOOL_GSET;
+ err = dev->ethtool_ops->get_settings(dev, &cmd);
+ if (err < 0)
+ return err;
+
+ /* we ignore deprecated fields transceiver/maxrxpkt/maxtxpkt
+ */
+ convert_legacy_settings_to_link_ksettings(link_ksettings, &cmd);
+ return err;
}
-EXPORT_SYMBOL(__ethtool_get_settings);
+EXPORT_SYMBOL(__ethtool_get_link_ksettings);
-static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+/* convert ethtool_link_usettings in user space to a kernel internal
+ * ethtool_link_ksettings. return 0 on success, errno on error.
+ */
+static int load_link_ksettings_from_user(struct ethtool_link_ksettings *to,
+ const void __user *from)
{
- int err;
- struct ethtool_cmd cmd;
+ struct ethtool_link_usettings link_usettings;
+
+ if (copy_from_user(&link_usettings, from, sizeof(link_usettings)))
+ return -EFAULT;
+
+ memcpy(&to->base, &link_usettings.base, sizeof(to->base));
+ bitmap_from_u32array(to->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_usettings.link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NU32);
+ bitmap_from_u32array(to->link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_usettings.link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NU32);
+ bitmap_from_u32array(to->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS,
+ link_usettings.link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NU32);
+
+ return 0;
+}
+
+/* convert a kernel internal ethtool_link_ksettings to
+ * ethtool_link_usettings in user space. return 0 on success, errno on
+ * error.
+ */
+static int
+store_link_ksettings_for_user(void __user *to,
+ const struct ethtool_link_ksettings *from)
+{
+ struct ethtool_link_usettings link_usettings;
+
+ memcpy(&link_usettings.base, &from->base, sizeof(link_usettings));
+ bitmap_to_u32array(link_usettings.link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NU32,
+ from->link_modes.supported,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_to_u32array(link_usettings.link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NU32,
+ from->link_modes.advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+ bitmap_to_u32array(link_usettings.link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NU32,
+ from->link_modes.lp_advertising,
+ __ETHTOOL_LINK_MODE_MASK_NBITS);
+
+ if (copy_to_user(to, &link_usettings, sizeof(link_usettings)))
+ return -EFAULT;
+
+ return 0;
+}
- err = __ethtool_get_settings(dev, &cmd);
+/* Query device for its ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::get_link_ksettings, even if legacy
+ * ethtool_ops::get_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_GSET for
+ * this driver, so that they can correctly access the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_get_link_ksettings(struct net_device *dev,
+ void __user *useraddr)
+{
+ int err = 0;
+ struct ethtool_link_ksettings link_ksettings;
+
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->get_link_ksettings)
+ return -EOPNOTSUPP;
+
+ /* handle bitmap nbits handshake */
+ if (copy_from_user(&link_ksettings.base, useraddr,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords) {
+ /* wrong link mode nbits requested */
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+ /* send back number of words required as negative val */
+ compiletime_assert(__ETHTOOL_LINK_MODE_MASK_NU32 <= S8_MAX,
+ "need too many bits for link modes!");
+ link_ksettings.base.link_mode_masks_nwords
+ = -((s8)__ETHTOOL_LINK_MODE_MASK_NU32);
+
+ /* copy the base fields back to user, not the link
+ * mode bitmaps
+ */
+ if (copy_to_user(useraddr, &link_ksettings.base,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ return 0;
+ }
+
+ /* handshake successful: user/kernel agree on
+ * link_mode_masks_nwords
+ */
+
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev, &link_ksettings);
if (err < 0)
return err;
+ /* make sure we tell the right values to user */
+ link_ksettings.base.cmd = ETHTOOL_GLINKSETTINGS;
+ link_ksettings.base.link_mode_masks_nwords
+ = __ETHTOOL_LINK_MODE_MASK_NU32;
+
+ return store_link_ksettings_for_user(useraddr, &link_ksettings);
+}
+
+/* Update device ethtool_link_settings.
+ *
+ * Backward compatibility note: this function must fail when driver
+ * does not implement ethtool::set_link_ksettings, even if legacy
+ * ethtool_ops::set_settings is implemented. This tells new versions
+ * of ethtool that they should use the legacy API %ETHTOOL_SSET for
+ * this driver, so that they can correctly update the ethtool_cmd
+ * deprecated fields (transceiver/maxrxpkt/maxtxpkt), until no driver
+ * implements ethtool_ops::get_settings anymore.
+ */
+static int ethtool_set_link_ksettings(struct net_device *dev,
+ void __user *useraddr)
+{
+ int err;
+ struct ethtool_link_ksettings link_ksettings;
+
+ ASSERT_RTNL();
+
+ if (!dev->ethtool_ops->set_link_ksettings)
+ return -EOPNOTSUPP;
+
+ /* make sure nbits field has expected value */
+ if (copy_from_user(&link_ksettings.base, useraddr,
+ sizeof(link_ksettings.base)))
+ return -EFAULT;
+
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords)
+ return -EINVAL;
+
+ /* copy the whole structure, now that we know it has expected
+ * format
+ */
+ err = load_link_ksettings_from_user(&link_ksettings, useraddr);
+ if (err)
+ return err;
+
+ /* re-check nwords field, just in case */
+ if (__ETHTOOL_LINK_MODE_MASK_NU32
+ != link_ksettings.base.link_mode_masks_nwords)
+ return -EINVAL;
+
+ return dev->ethtool_ops->set_link_ksettings(dev, &link_ksettings);
+}
+
+static void
+warn_incomplete_ethtool_legacy_settings_conversion(const char *details)
+{
+ char name[sizeof(current->comm)];
+
+ pr_info_once("warning: `%s' uses legacy ethtool link settings API, %s\n",
+ get_task_comm(name, current), details);
+}
+
+/* Query device for its ethtool_cmd settings.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing get_link_ksettings
+ * API and drivers implementing get_settings API. When drivers
+ * implement get_link_ksettings and report higher link mode bits, a
+ * kernel warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, but the command is successful
+ * (only the lower link mode bits reported back to user).
+ */
+static int ethtool_get_settings(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_cmd cmd;
+
+ ASSERT_RTNL();
+
+ if (dev->ethtool_ops->get_link_ksettings) {
+ /* First, use link_ksettings API if it is supported */
+ int err;
+ struct ethtool_link_ksettings link_ksettings;
+
+ memset(&link_ksettings, 0, sizeof(link_ksettings));
+ err = dev->ethtool_ops->get_link_ksettings(dev,
+ &link_ksettings);
+ if (err < 0)
+ return err;
+ if (!convert_link_ksettings_to_legacy_settings(&cmd,
+ &link_ksettings))
+ warn_incomplete_ethtool_legacy_settings_conversion(
+ "link modes are only partially reported");
+
+ /* send a sensible cmd tag back to user */
+ cmd.cmd = ETHTOOL_GSET;
+ } else {
+ /* driver doesn't support %ethtool_link_ksettings
+ * API. revert to legacy %ethtool_cmd API, unless it's
+ * not supported either.
+ */
+ int err;
+
+ if (!dev->ethtool_ops->get_settings)
+ return -EOPNOTSUPP;
+
+ memset(&cmd, 0, sizeof(cmd));
+ cmd.cmd = ETHTOOL_GSET;
+ err = dev->ethtool_ops->get_settings(dev, &cmd);
+ if (err < 0)
+ return err;
+ }
+
if (copy_to_user(useraddr, &cmd, sizeof(cmd)))
return -EFAULT;
+
return 0;
}
+/* Update device link settings with given ethtool_cmd.
+ *
+ * Backward compatibility note: for compatibility with legacy ethtool,
+ * this has to work with both drivers implementing set_link_ksettings
+ * API and drivers implementing set_settings API. When drivers
+ * implement set_link_ksettings and user's request updates deprecated
+ * ethtool_cmd fields (transceiver/maxrxpkt/maxtxpkt), a kernel
+ * warning is logged once (with name of 1st driver/device) to
+ * recommend user to upgrade ethtool, and the request is rejected.
+ */
static int ethtool_set_settings(struct net_device *dev, void __user *useraddr)
{
struct ethtool_cmd cmd;
- if (!dev->ethtool_ops->set_settings)
- return -EOPNOTSUPP;
+ ASSERT_RTNL();
if (copy_from_user(&cmd, useraddr, sizeof(cmd)))
return -EFAULT;
+ /* first, try new %ethtool_link_ksettings API. */
+ if (dev->ethtool_ops->set_link_ksettings) {
+ struct ethtool_link_ksettings link_ksettings;
+
+ if (!convert_legacy_settings_to_link_ksettings(&link_ksettings,
+ &cmd))
+ return -EINVAL;
+
+ link_ksettings.base.cmd = ETHTOOL_SLINKSETTINGS;
+ link_ksettings.base.link_mode_masks_nwords
+ = __ETHTOOL_LINK_MODE_MASK_NU32;
+ return dev->ethtool_ops->set_link_ksettings(dev,
+ &link_ksettings);
+ }
+
+ /* legacy %ethtool_cmd API */
+
+ /* TODO: return -EOPNOTSUPP when ethtool_ops::get_settings
+ * disappears internally
+ */
+
+ if (!dev->ethtool_ops->set_settings)
+ return -EOPNOTSUPP;
+
return dev->ethtool_ops->set_settings(dev, &cmd);
}
@@ -598,7 +1051,7 @@ static int ethtool_copy_validate_indir(u32 *indir, void __user *useraddr,
return 0;
}
-u8 netdev_rss_key[NETDEV_RSS_KEY_LEN];
+u8 netdev_rss_key[NETDEV_RSS_KEY_LEN] __read_mostly;
void netdev_rss_key_fill(void *buffer, size_t len)
{
@@ -608,6 +1061,37 @@ void netdev_rss_key_fill(void *buffer, size_t len)
}
EXPORT_SYMBOL(netdev_rss_key_fill);
+static int ethtool_get_max_rxfh_channel(struct net_device *dev, u32 *max)
+{
+ u32 dev_size, current_max = 0;
+ u32 *indir;
+ int ret;
+
+ if (!dev->ethtool_ops->get_rxfh_indir_size ||
+ !dev->ethtool_ops->get_rxfh)
+ return -EOPNOTSUPP;
+ dev_size = dev->ethtool_ops->get_rxfh_indir_size(dev);
+ if (dev_size == 0)
+ return -EOPNOTSUPP;
+
+ indir = kcalloc(dev_size, sizeof(indir[0]), GFP_USER);
+ if (!indir)
+ return -ENOMEM;
+
+ ret = dev->ethtool_ops->get_rxfh(dev, indir, NULL, NULL);
+ if (ret)
+ goto out;
+
+ while (dev_size--)
+ current_max = max(current_max, indir[dev_size]);
+
+ *max = current_max;
+
+out:
+ kfree(indir);
+ return ret;
+}
+
static noinline_for_stack int ethtool_get_rxfh_indir(struct net_device *dev,
void __user *useraddr)
{
@@ -704,6 +1188,14 @@ static noinline_for_stack int ethtool_set_rxfh_indir(struct net_device *dev,
}
ret = ops->set_rxfh(dev, indir, NULL, ETH_RSS_HASH_NO_CHANGE);
+ if (ret)
+ goto out;
+
+ /* indicate whether rxfh was set to default */
+ if (user_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
out:
kfree(indir);
@@ -863,6 +1355,14 @@ static noinline_for_stack int ethtool_set_rxfh(struct net_device *dev,
}
ret = ops->set_rxfh(dev, indir, hkey, rxfh.hfunc);
+ if (ret)
+ goto out;
+
+ /* indicate whether rxfh was set to default */
+ if (rxfh.indir_size == 0)
+ dev->priv_flags &= ~IFF_RXFH_CONFIGURED;
+ else if (rxfh.indir_size != ETH_RXFH_INDIR_NO_CHANGE)
+ dev->priv_flags |= IFF_RXFH_CONFIGURED;
out:
kfree(rss_config);
@@ -1193,14 +1693,31 @@ static noinline_for_stack int ethtool_get_channels(struct net_device *dev,
static noinline_for_stack int ethtool_set_channels(struct net_device *dev,
void __user *useraddr)
{
- struct ethtool_channels channels;
+ struct ethtool_channels channels, max;
+ u32 max_rx_in_use = 0;
- if (!dev->ethtool_ops->set_channels)
+ if (!dev->ethtool_ops->set_channels || !dev->ethtool_ops->get_channels)
return -EOPNOTSUPP;
if (copy_from_user(&channels, useraddr, sizeof(channels)))
return -EFAULT;
+ dev->ethtool_ops->get_channels(dev, &max);
+
+ /* ensure new counts are within the maximums */
+ if ((channels.rx_count > max.max_rx) ||
+ (channels.tx_count > max.max_tx) ||
+ (channels.combined_count > max.max_combined) ||
+ (channels.other_count > max.max_other))
+ return -EINVAL;
+
+ /* ensure the new Rx count fits within the configured Rx flow
+ * indirection table settings */
+ if (netif_is_rxfh_configured(dev) &&
+ !ethtool_get_max_rxfh_channel(dev, &max_rx_in_use) &&
+ (channels.combined_count + channels.rx_count) <= max_rx_in_use)
+ return -EINVAL;
+
return dev->ethtool_ops->set_channels(dev, &channels);
}
@@ -1401,6 +1918,47 @@ static int ethtool_get_stats(struct net_device *dev, void __user *useraddr)
return ret;
}
+static int ethtool_get_phy_stats(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_stats stats;
+ struct phy_device *phydev = dev->phydev;
+ u64 *data;
+ int ret, n_stats;
+
+ if (!phydev)
+ return -EOPNOTSUPP;
+
+ n_stats = phy_get_sset_count(phydev);
+
+ if (n_stats < 0)
+ return n_stats;
+ WARN_ON(n_stats == 0);
+
+ if (copy_from_user(&stats, useraddr, sizeof(stats)))
+ return -EFAULT;
+
+ stats.n_stats = n_stats;
+ data = kmalloc_array(n_stats, sizeof(u64), GFP_USER);
+ if (!data)
+ return -ENOMEM;
+
+ mutex_lock(&phydev->lock);
+ phydev->drv->get_stats(phydev, &stats, data);
+ mutex_unlock(&phydev->lock);
+
+ ret = -EFAULT;
+ if (copy_to_user(useraddr, &stats, sizeof(stats)))
+ goto out;
+ useraddr += sizeof(stats);
+ if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64)))
+ goto out;
+ ret = 0;
+
+ out:
+ kfree(data);
+ return ret;
+}
+
static int ethtool_get_perm_addr(struct net_device *dev, void __user *useraddr)
{
struct ethtool_perm_addr epaddr;
@@ -1748,13 +2306,121 @@ out:
return ret;
}
+static int ethtool_get_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
+{
+ u32 bit;
+ int ret;
+ DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+ if (!dev->ethtool_ops->get_per_queue_coalesce)
+ return -EOPNOTSUPP;
+
+ useraddr += sizeof(*per_queue_opt);
+
+ bitmap_from_u32array(queue_mask,
+ MAX_NUM_QUEUE,
+ per_queue_opt->queue_mask,
+ DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+
+ for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+ struct ethtool_coalesce coalesce = { .cmd = ETHTOOL_GCOALESCE };
+
+ ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, &coalesce);
+ if (ret != 0)
+ return ret;
+ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce)))
+ return -EFAULT;
+ useraddr += sizeof(coalesce);
+ }
+
+ return 0;
+}
+
+static int ethtool_set_per_queue_coalesce(struct net_device *dev,
+ void __user *useraddr,
+ struct ethtool_per_queue_op *per_queue_opt)
+{
+ u32 bit;
+ int i, ret = 0;
+ int n_queue;
+ struct ethtool_coalesce *backup = NULL, *tmp = NULL;
+ DECLARE_BITMAP(queue_mask, MAX_NUM_QUEUE);
+
+ if ((!dev->ethtool_ops->set_per_queue_coalesce) ||
+ (!dev->ethtool_ops->get_per_queue_coalesce))
+ return -EOPNOTSUPP;
+
+ useraddr += sizeof(*per_queue_opt);
+
+ bitmap_from_u32array(queue_mask,
+ MAX_NUM_QUEUE,
+ per_queue_opt->queue_mask,
+ DIV_ROUND_UP(MAX_NUM_QUEUE, 32));
+ n_queue = bitmap_weight(queue_mask, MAX_NUM_QUEUE);
+ tmp = backup = kmalloc_array(n_queue, sizeof(*backup), GFP_KERNEL);
+ if (!backup)
+ return -ENOMEM;
+
+ for_each_set_bit(bit, queue_mask, MAX_NUM_QUEUE) {
+ struct ethtool_coalesce coalesce;
+
+ ret = dev->ethtool_ops->get_per_queue_coalesce(dev, bit, tmp);
+ if (ret != 0)
+ goto roll_back;
+
+ tmp++;
+
+ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) {
+ ret = -EFAULT;
+ goto roll_back;
+ }
+
+ ret = dev->ethtool_ops->set_per_queue_coalesce(dev, bit, &coalesce);
+ if (ret != 0)
+ goto roll_back;
+
+ useraddr += sizeof(coalesce);
+ }
+
+roll_back:
+ if (ret != 0) {
+ tmp = backup;
+ for_each_set_bit(i, queue_mask, bit) {
+ dev->ethtool_ops->set_per_queue_coalesce(dev, i, tmp);
+ tmp++;
+ }
+ }
+ kfree(backup);
+
+ return ret;
+}
+
+static int ethtool_set_per_queue(struct net_device *dev, void __user *useraddr)
+{
+ struct ethtool_per_queue_op per_queue_opt;
+
+ if (copy_from_user(&per_queue_opt, useraddr, sizeof(per_queue_opt)))
+ return -EFAULT;
+
+ switch (per_queue_opt.sub_command) {
+ case ETHTOOL_GCOALESCE:
+ return ethtool_get_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+ case ETHTOOL_SCOALESCE:
+ return ethtool_set_per_queue_coalesce(dev, useraddr, &per_queue_opt);
+ default:
+ return -EOPNOTSUPP;
+ };
+}
+
/* The main entry point in this file. Called from net/core/dev_ioctl.c */
int dev_ethtool(struct net *net, struct ifreq *ifr)
{
struct net_device *dev = __dev_get_by_name(net, ifr->ifr_name);
void __user *useraddr = ifr->ifr_data;
- u32 ethcmd;
+ u32 ethcmd, sub_cmd;
int rc;
netdev_features_t old_features;
@@ -1764,8 +2430,14 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
if (copy_from_user(&ethcmd, useraddr, sizeof(ethcmd)))
return -EFAULT;
+ if (ethcmd == ETHTOOL_PERQUEUE) {
+ if (copy_from_user(&sub_cmd, useraddr + sizeof(ethcmd), sizeof(sub_cmd)))
+ return -EFAULT;
+ } else {
+ sub_cmd = ethcmd;
+ }
/* Allow some commands to be done by anyone */
- switch (ethcmd) {
+ switch (sub_cmd) {
case ETHTOOL_GSET:
case ETHTOOL_GDRVINFO:
case ETHTOOL_GMSGLVL:
@@ -1779,6 +2451,7 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_GSSET_INFO:
case ETHTOOL_GSTRINGS:
case ETHTOOL_GSTATS:
+ case ETHTOOL_GPHYSTATS:
case ETHTOOL_GTSO:
case ETHTOOL_GPERMADDR:
case ETHTOOL_GUFO:
@@ -1991,6 +2664,18 @@ int dev_ethtool(struct net *net, struct ifreq *ifr)
case ETHTOOL_STUNABLE:
rc = ethtool_set_tunable(dev, useraddr);
break;
+ case ETHTOOL_GPHYSTATS:
+ rc = ethtool_get_phy_stats(dev, useraddr);
+ break;
+ case ETHTOOL_PERQUEUE:
+ rc = ethtool_set_per_queue(dev, useraddr);
+ break;
+ case ETHTOOL_GLINKSETTINGS:
+ rc = ethtool_get_link_ksettings(dev, useraddr);
+ break;
+ case ETHTOOL_SLINKSETTINGS:
+ rc = ethtool_set_link_ksettings(dev, useraddr);
+ break;
default:
rc = -EOPNOTSUPP;
}
diff --git a/net/core/filter.c b/net/core/filter.c
index 672eefbfbe99..ca7f832b2980 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -50,6 +50,7 @@
#include <net/cls_cgroup.h>
#include <net/dst_metadata.h>
#include <net/dst.h>
+#include <net/sock_reuseport.h>
/**
* sk_filter - run a packet through a socket filter
@@ -348,12 +349,6 @@ static bool convert_bpf_extensions(struct sock_filter *fp,
* jump offsets, 2nd pass remapping:
* new_prog = kmalloc(sizeof(struct bpf_insn) * new_len);
* bpf_convert_filter(old_prog, old_len, new_prog, &new_len);
- *
- * User BPF's register A is mapped to our BPF register 6, user BPF
- * register X is mapped to BPF register 7; frame pointer is always
- * register 10; Context 'void *ctx' is stored in register 1, that is,
- * for socket filters: ctx == 'struct sk_buff *', for seccomp:
- * ctx == 'struct seccomp_data *'.
*/
static int bpf_convert_filter(struct sock_filter *prog, int len,
struct bpf_insn *new_prog, int *new_len)
@@ -381,9 +376,22 @@ do_pass:
new_insn = new_prog;
fp = prog;
- if (new_insn)
- *new_insn = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
- new_insn++;
+ /* Classic BPF related prologue emission. */
+ if (new_insn) {
+ /* Classic BPF expects A and X to be reset first. These need
+ * to be guaranteed to be the first two instructions.
+ */
+ *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_A, BPF_REG_A);
+ *new_insn++ = BPF_ALU64_REG(BPF_XOR, BPF_REG_X, BPF_REG_X);
+
+ /* All programs must keep CTX in callee saved BPF_REG_CTX.
+ * In eBPF case it's done by the compiler, here we need to
+ * do this ourself. Initial CTX is present in BPF_REG_ARG1.
+ */
+ *new_insn++ = BPF_MOV64_REG(BPF_REG_CTX, BPF_REG_ARG1);
+ } else {
+ new_insn += 3;
+ }
for (i = 0; i < len; fp++, i++) {
struct bpf_insn tmp_insns[6] = { };
@@ -522,12 +530,14 @@ do_pass:
*insn = BPF_MOV64_REG(BPF_REG_A, BPF_REG_TMP);
break;
- /* RET_K, RET_A are remaped into 2 insns. */
+ /* RET_K is remaped into 2 insns. RET_A case doesn't need an
+ * extra mov as BPF_REG_0 is already mapped into BPF_REG_A.
+ */
case BPF_RET | BPF_A:
case BPF_RET | BPF_K:
- *insn++ = BPF_MOV32_RAW(BPF_RVAL(fp->code) == BPF_K ?
- BPF_K : BPF_X, BPF_REG_0,
- BPF_REG_A, fp->k);
+ if (BPF_RVAL(fp->code) == BPF_K)
+ *insn++ = BPF_MOV32_RAW(BPF_K, BPF_REG_0,
+ 0, fp->k);
*insn = BPF_EXIT_INSN();
break;
@@ -777,6 +787,11 @@ static int bpf_check_classic(const struct sock_filter *filter,
if (ftest->k == 0)
return -EINVAL;
break;
+ case BPF_ALU | BPF_LSH | BPF_K:
+ case BPF_ALU | BPF_RSH | BPF_K:
+ if (ftest->k >= 32)
+ return -EINVAL;
+ break;
case BPF_LD | BPF_MEM:
case BPF_LDX | BPF_MEM:
case BPF_ST:
@@ -1134,7 +1149,8 @@ void bpf_prog_destroy(struct bpf_prog *fp)
}
EXPORT_SYMBOL_GPL(bpf_prog_destroy);
-static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
+static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk,
+ bool locked)
{
struct sk_filter *fp, *old_fp;
@@ -1150,27 +1166,40 @@ static int __sk_attach_prog(struct bpf_prog *prog, struct sock *sk)
return -ENOMEM;
}
- old_fp = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
+ old_fp = rcu_dereference_protected(sk->sk_filter, locked);
rcu_assign_pointer(sk->sk_filter, fp);
-
if (old_fp)
sk_filter_uncharge(sk, old_fp);
return 0;
}
-/**
- * sk_attach_filter - attach a socket filter
- * @fprog: the filter program
- * @sk: the socket to use
- *
- * Attach the user's filter code. We first run some sanity checks on
- * it to make sure it does not explode on us later. If an error
- * occurs or there is insufficient memory for the filter a negative
- * errno code is returned. On success the return is zero.
- */
-int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+static int __reuseport_attach_prog(struct bpf_prog *prog, struct sock *sk)
+{
+ struct bpf_prog *old_prog;
+ int err;
+
+ if (bpf_prog_size(prog->len) > sysctl_optmem_max)
+ return -ENOMEM;
+
+ if (sk_unhashed(sk) && sk->sk_reuseport) {
+ err = reuseport_alloc(sk);
+ if (err)
+ return err;
+ } else if (!rcu_access_pointer(sk->sk_reuseport_cb)) {
+ /* The socket wasn't bound with SO_REUSEPORT */
+ return -EINVAL;
+ }
+
+ old_prog = reuseport_attach_prog(sk, prog);
+ if (old_prog)
+ bpf_prog_destroy(old_prog);
+
+ return 0;
+}
+
+static
+struct bpf_prog *__get_filter(struct sock_fprog *fprog, struct sock *sk)
{
unsigned int fsize = bpf_classic_proglen(fprog);
unsigned int bpf_fsize = bpf_prog_size(fprog->len);
@@ -1178,19 +1207,19 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
int err;
if (sock_flag(sk, SOCK_FILTER_LOCKED))
- return -EPERM;
+ return ERR_PTR(-EPERM);
/* Make sure new filter is there and in the right amounts. */
if (fprog->filter == NULL)
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
prog = bpf_prog_alloc(bpf_fsize, 0);
if (!prog)
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
if (copy_from_user(prog->insns, fprog->filter, fsize)) {
__bpf_prog_free(prog);
- return -EFAULT;
+ return ERR_PTR(-EFAULT);
}
prog->len = fprog->len;
@@ -1198,17 +1227,35 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
err = bpf_prog_store_orig_filter(prog, fprog);
if (err) {
__bpf_prog_free(prog);
- return -ENOMEM;
+ return ERR_PTR(-ENOMEM);
}
/* bpf_prepare_filter() already takes care of freeing
* memory in case something goes wrong.
*/
- prog = bpf_prepare_filter(prog, NULL);
+ return bpf_prepare_filter(prog, NULL);
+}
+
+/**
+ * sk_attach_filter - attach a socket filter
+ * @fprog: the filter program
+ * @sk: the socket to use
+ *
+ * Attach the user's filter code. We first run some sanity checks on
+ * it to make sure it does not explode on us later. If an error
+ * occurs or there is insufficient memory for the filter a negative
+ * errno code is returned. On success the return is zero.
+ */
+int __sk_attach_filter(struct sock_fprog *fprog, struct sock *sk,
+ bool locked)
+{
+ struct bpf_prog *prog = __get_filter(fprog, sk);
+ int err;
+
if (IS_ERR(prog))
return PTR_ERR(prog);
- err = __sk_attach_prog(prog, sk);
+ err = __sk_attach_prog(prog, sk, locked);
if (err < 0) {
__bpf_prog_release(prog);
return err;
@@ -1216,26 +1263,75 @@ int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
return 0;
}
-EXPORT_SYMBOL_GPL(sk_attach_filter);
+EXPORT_SYMBOL_GPL(__sk_attach_filter);
-int sk_attach_bpf(u32 ufd, struct sock *sk)
+int sk_attach_filter(struct sock_fprog *fprog, struct sock *sk)
{
- struct bpf_prog *prog;
+ return __sk_attach_filter(fprog, sk, sock_owned_by_user(sk));
+}
+
+int sk_reuseport_attach_filter(struct sock_fprog *fprog, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_filter(fprog, sk);
int err;
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __reuseport_attach_prog(prog, sk);
+ if (err < 0) {
+ __bpf_prog_release(prog);
+ return err;
+ }
+
+ return 0;
+}
+
+static struct bpf_prog *__get_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog;
+
if (sock_flag(sk, SOCK_FILTER_LOCKED))
- return -EPERM;
+ return ERR_PTR(-EPERM);
prog = bpf_prog_get(ufd);
if (IS_ERR(prog))
- return PTR_ERR(prog);
+ return prog;
if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
bpf_prog_put(prog);
- return -EINVAL;
+ return ERR_PTR(-EINVAL);
+ }
+
+ return prog;
+}
+
+int sk_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_bpf(ufd, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __sk_attach_prog(prog, sk, sock_owned_by_user(sk));
+ if (err < 0) {
+ bpf_prog_put(prog);
+ return err;
}
- err = __sk_attach_prog(prog, sk);
+ return 0;
+}
+
+int sk_reuseport_attach_bpf(u32 ufd, struct sock *sk)
+{
+ struct bpf_prog *prog = __get_bpf(ufd, sk);
+ int err;
+
+ if (IS_ERR(prog))
+ return PTR_ERR(prog);
+
+ err = __reuseport_attach_prog(prog, sk);
if (err < 0) {
bpf_prog_put(prog);
return err;
@@ -1244,17 +1340,27 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
return 0;
}
-#define BPF_RECOMPUTE_CSUM(flags) ((flags) & 1)
+struct bpf_scratchpad {
+ union {
+ __be32 diff[MAX_BPF_STACK / sizeof(__be32)];
+ u8 buff[MAX_BPF_STACK];
+ };
+};
+
+static DEFINE_PER_CPU(struct bpf_scratchpad, bpf_sp);
static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
{
+ struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
struct sk_buff *skb = (struct sk_buff *) (long) r1;
int offset = (int) r2;
void *from = (void *) (long) r3;
unsigned int len = (unsigned int) r4;
- char buf[16];
void *ptr;
+ if (unlikely(flags & ~(BPF_F_RECOMPUTE_CSUM | BPF_F_INVALIDATE_HASH)))
+ return -EINVAL;
+
/* bpf verifier guarantees that:
* 'from' pointer points to bpf program stack
* 'len' bytes of it were initialized
@@ -1263,32 +1369,33 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
*
* so check for invalid 'offset' and too large 'len'
*/
- if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
+ if (unlikely((u32) offset > 0xffff || len > sizeof(sp->buff)))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + len)))
+ if (unlikely(skb_try_make_writable(skb, offset + len)))
return -EFAULT;
- ptr = skb_header_pointer(skb, offset, len, buf);
+ ptr = skb_header_pointer(skb, offset, len, sp->buff);
if (unlikely(!ptr))
return -EFAULT;
- if (BPF_RECOMPUTE_CSUM(flags))
+ if (flags & BPF_F_RECOMPUTE_CSUM)
skb_postpull_rcsum(skb, ptr, len);
memcpy(ptr, from, len);
- if (ptr == buf)
+ if (ptr == sp->buff)
/* skb_store_bits cannot return -EFAULT here */
skb_store_bits(skb, offset, ptr, len);
- if (BPF_RECOMPUTE_CSUM(flags) && skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(skb->csum, csum_partial(ptr, len, 0));
+ if (flags & BPF_F_RECOMPUTE_CSUM)
+ skb_postpush_rcsum(skb, ptr, len);
+ if (flags & BPF_F_INVALIDATE_HASH)
+ skb_clear_hash(skb);
+
return 0;
}
-const struct bpf_func_proto bpf_skb_store_bytes_proto = {
+static const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.func = bpf_skb_store_bytes,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1299,8 +1406,35 @@ const struct bpf_func_proto bpf_skb_store_bytes_proto = {
.arg5_type = ARG_ANYTHING,
};
-#define BPF_HEADER_FIELD_SIZE(flags) ((flags) & 0x0f)
-#define BPF_IS_PSEUDO_HEADER(flags) ((flags) & 0x10)
+static u64 bpf_skb_load_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
+{
+ const struct sk_buff *skb = (const struct sk_buff *)(unsigned long) r1;
+ int offset = (int) r2;
+ void *to = (void *)(unsigned long) r3;
+ unsigned int len = (unsigned int) r4;
+ void *ptr;
+
+ if (unlikely((u32) offset > 0xffff || len > MAX_BPF_STACK))
+ return -EFAULT;
+
+ ptr = skb_header_pointer(skb, offset, len, to);
+ if (unlikely(!ptr))
+ return -EFAULT;
+ if (ptr != to)
+ memcpy(to, ptr, len);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_load_bytes_proto = {
+ .func = bpf_skb_load_bytes,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_ANYTHING,
+ .arg3_type = ARG_PTR_TO_STACK,
+ .arg4_type = ARG_CONST_STACK_SIZE,
+};
static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
@@ -1308,18 +1442,24 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
int offset = (int) r2;
__sum16 sum, *ptr;
+ if (unlikely(flags & ~(BPF_F_HDR_FIELD_MASK)))
+ return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(sum))))
+ if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
if (unlikely(!ptr))
return -EFAULT;
- switch (BPF_HEADER_FIELD_SIZE(flags)) {
+ switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
+
+ csum_replace_by_diff(ptr, to);
+ break;
case 2:
csum_replace2(ptr, from, to);
break;
@@ -1337,7 +1477,7 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}
-const struct bpf_func_proto bpf_l3_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l3_csum_replace_proto = {
.func = bpf_l3_csum_replace,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1351,22 +1491,32 @@ const struct bpf_func_proto bpf_l3_csum_replace_proto = {
static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
- bool is_pseudo = !!BPF_IS_PSEUDO_HEADER(flags);
+ bool is_pseudo = flags & BPF_F_PSEUDO_HDR;
+ bool is_mmzero = flags & BPF_F_MARK_MANGLED_0;
int offset = (int) r2;
__sum16 sum, *ptr;
+ if (unlikely(flags & ~(BPF_F_MARK_MANGLED_0 | BPF_F_PSEUDO_HDR |
+ BPF_F_HDR_FIELD_MASK)))
+ return -EINVAL;
if (unlikely((u32) offset > 0xffff))
return -EFAULT;
-
- if (unlikely(skb_cloned(skb) &&
- !skb_clone_writable(skb, offset + sizeof(sum))))
+ if (unlikely(skb_try_make_writable(skb, offset + sizeof(sum))))
return -EFAULT;
ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
if (unlikely(!ptr))
return -EFAULT;
+ if (is_mmzero && !*ptr)
+ return 0;
+
+ switch (flags & BPF_F_HDR_FIELD_MASK) {
+ case 0:
+ if (unlikely(from != 0))
+ return -EINVAL;
- switch (BPF_HEADER_FIELD_SIZE(flags)) {
+ inet_proto_csum_replace_by_diff(ptr, skb, to, is_pseudo);
+ break;
case 2:
inet_proto_csum_replace2(ptr, skb, from, to, is_pseudo);
break;
@@ -1377,6 +1527,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return -EINVAL;
}
+ if (is_mmzero && !*ptr)
+ *ptr = CSUM_MANGLED_0;
if (ptr == &sum)
/* skb_store_bits guaranteed to not return -EFAULT here */
skb_store_bits(skb, offset, ptr, sizeof(sum));
@@ -1384,7 +1536,7 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
return 0;
}
-const struct bpf_func_proto bpf_l4_csum_replace_proto = {
+static const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.func = bpf_l4_csum_replace,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1395,13 +1547,53 @@ const struct bpf_func_proto bpf_l4_csum_replace_proto = {
.arg5_type = ARG_ANYTHING,
};
-#define BPF_IS_REDIRECT_INGRESS(flags) ((flags) & 1)
+static u64 bpf_csum_diff(u64 r1, u64 from_size, u64 r3, u64 to_size, u64 seed)
+{
+ struct bpf_scratchpad *sp = this_cpu_ptr(&bpf_sp);
+ u64 diff_size = from_size + to_size;
+ __be32 *from = (__be32 *) (long) r1;
+ __be32 *to = (__be32 *) (long) r3;
+ int i, j = 0;
+
+ /* This is quite flexible, some examples:
+ *
+ * from_size == 0, to_size > 0, seed := csum --> pushing data
+ * from_size > 0, to_size == 0, seed := csum --> pulling data
+ * from_size > 0, to_size > 0, seed := 0 --> diffing data
+ *
+ * Even for diffing, from_size and to_size don't need to be equal.
+ */
+ if (unlikely(((from_size | to_size) & (sizeof(__be32) - 1)) ||
+ diff_size > sizeof(sp->diff)))
+ return -EINVAL;
+
+ for (i = 0; i < from_size / sizeof(__be32); i++, j++)
+ sp->diff[j] = ~from[i];
+ for (i = 0; i < to_size / sizeof(__be32); i++, j++)
+ sp->diff[j] = to[i];
+
+ return csum_partial(sp->diff, diff_size, seed);
+}
+
+static const struct bpf_func_proto bpf_csum_diff_proto = {
+ .func = bpf_csum_diff,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_STACK,
+ .arg2_type = ARG_CONST_STACK_SIZE_OR_ZERO,
+ .arg3_type = ARG_PTR_TO_STACK,
+ .arg4_type = ARG_CONST_STACK_SIZE_OR_ZERO,
+ .arg5_type = ARG_ANYTHING,
+};
static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1, *skb2;
struct net_device *dev;
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return -EINVAL;
+
dev = dev_get_by_index_rcu(dev_net(skb->dev), ifindex);
if (unlikely(!dev))
return -EINVAL;
@@ -1410,15 +1602,18 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
if (unlikely(!skb2))
return -ENOMEM;
- if (BPF_IS_REDIRECT_INGRESS(flags))
+ if (flags & BPF_F_INGRESS) {
+ if (skb_at_tc_ingress(skb2))
+ skb_postpush_rcsum(skb2, skb_mac_header(skb2),
+ skb2->mac_len);
return dev_forward_skb(dev, skb2);
+ }
skb2->dev = dev;
- skb_sender_cpu_clear(skb2);
return dev_queue_xmit(skb2);
}
-const struct bpf_func_proto bpf_clone_redirect_proto = {
+static const struct bpf_func_proto bpf_clone_redirect_proto = {
.func = bpf_clone_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1433,12 +1628,17 @@ struct redirect_info {
};
static DEFINE_PER_CPU(struct redirect_info, redirect_info);
+
static u64 bpf_redirect(u64 ifindex, u64 flags, u64 r3, u64 r4, u64 r5)
{
struct redirect_info *ri = this_cpu_ptr(&redirect_info);
+ if (unlikely(flags & ~(BPF_F_INGRESS)))
+ return TC_ACT_SHOT;
+
ri->ifindex = ifindex;
ri->flags = flags;
+
return TC_ACT_REDIRECT;
}
@@ -1454,15 +1654,18 @@ int skb_do_redirect(struct sk_buff *skb)
return -EINVAL;
}
- if (BPF_IS_REDIRECT_INGRESS(ri->flags))
+ if (ri->flags & BPF_F_INGRESS) {
+ if (skb_at_tc_ingress(skb))
+ skb_postpush_rcsum(skb, skb_mac_header(skb),
+ skb->mac_len);
return dev_forward_skb(dev, skb);
+ }
skb->dev = dev;
- skb_sender_cpu_clear(skb);
return dev_queue_xmit(skb);
}
-const struct bpf_func_proto bpf_redirect_proto = {
+static const struct bpf_func_proto bpf_redirect_proto = {
.func = bpf_redirect,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1484,14 +1687,7 @@ static const struct bpf_func_proto bpf_get_cgroup_classid_proto = {
static u64 bpf_get_route_realm(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5)
{
-#ifdef CONFIG_IP_ROUTE_CLASSID
- const struct dst_entry *dst;
-
- dst = skb_dst((struct sk_buff *) (unsigned long) r1);
- if (dst)
- return dst->tclassid;
-#endif
- return 0;
+ return dst_tclassid((struct sk_buff *) (unsigned long) r1);
}
static const struct bpf_func_proto bpf_get_route_realm_proto = {
@@ -1544,27 +1740,70 @@ bool bpf_helper_changes_skb_data(void *func)
return true;
if (func == bpf_skb_vlan_pop)
return true;
+ if (func == bpf_skb_store_bytes)
+ return true;
+ if (func == bpf_l3_csum_replace)
+ return true;
+ if (func == bpf_l4_csum_replace)
+ return true;
+
return false;
}
+static unsigned short bpf_tunnel_key_af(u64 flags)
+{
+ return flags & BPF_F_TUNINFO_IPV6 ? AF_INET6 : AF_INET;
+}
+
static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
{
struct sk_buff *skb = (struct sk_buff *) (long) r1;
struct bpf_tunnel_key *to = (struct bpf_tunnel_key *) (long) r2;
- struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ u8 compat[sizeof(struct bpf_tunnel_key)];
- if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
- return -EINVAL;
- if (ip_tunnel_info_af(info) != AF_INET)
+ if (unlikely(!info || (flags & ~(BPF_F_TUNINFO_IPV6))))
return -EINVAL;
+ if (ip_tunnel_info_af(info) != bpf_tunnel_key_af(flags))
+ return -EPROTO;
+ if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
+ goto set_compat;
+ case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
+ /* Fixup deprecated structure layouts here, so we have
+ * a common path later on.
+ */
+ if (ip_tunnel_info_af(info) != AF_INET)
+ return -EINVAL;
+set_compat:
+ to = (struct bpf_tunnel_key *)compat;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
to->tunnel_id = be64_to_cpu(info->key.tun_id);
- to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ to->tunnel_tos = info->key.tos;
+ to->tunnel_ttl = info->key.ttl;
+
+ if (flags & BPF_F_TUNINFO_IPV6) {
+ memcpy(to->remote_ipv6, &info->key.u.ipv6.src,
+ sizeof(to->remote_ipv6));
+ to->tunnel_label = be32_to_cpu(info->key.label);
+ } else {
+ to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
+ }
+
+ if (unlikely(size != sizeof(struct bpf_tunnel_key)))
+ memcpy((void *)(long) r2, to, size);
return 0;
}
-const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.func = bpf_skb_get_tunnel_key,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1574,6 +1813,32 @@ const struct bpf_func_proto bpf_skb_get_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
+static u64 bpf_skb_get_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u8 *to = (u8 *) (long) r2;
+ const struct ip_tunnel_info *info = skb_tunnel_info(skb);
+
+ if (unlikely(!info ||
+ !(info->key.tun_flags & TUNNEL_OPTIONS_PRESENT)))
+ return -ENOENT;
+ if (unlikely(size < info->options_len))
+ return -ENOMEM;
+
+ ip_tunnel_info_opts_get(to, info);
+
+ return info->options_len;
+}
+
+static const struct bpf_func_proto bpf_skb_get_tunnel_opt_proto = {
+ .func = bpf_skb_get_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
static struct metadata_dst __percpu *md_dst;
static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
@@ -1581,9 +1846,30 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
struct sk_buff *skb = (struct sk_buff *) (long) r1;
struct bpf_tunnel_key *from = (struct bpf_tunnel_key *) (long) r2;
struct metadata_dst *md = this_cpu_ptr(md_dst);
+ u8 compat[sizeof(struct bpf_tunnel_key)];
struct ip_tunnel_info *info;
- if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags))
+ if (unlikely(flags & ~(BPF_F_TUNINFO_IPV6 | BPF_F_ZERO_CSUM_TX |
+ BPF_F_DONT_FRAGMENT)))
+ return -EINVAL;
+ if (unlikely(size != sizeof(struct bpf_tunnel_key))) {
+ switch (size) {
+ case offsetof(struct bpf_tunnel_key, tunnel_label):
+ case offsetof(struct bpf_tunnel_key, tunnel_ext):
+ case offsetof(struct bpf_tunnel_key, remote_ipv6[1]):
+ /* Fixup deprecated structure layouts here, so we have
+ * a common path later on.
+ */
+ memcpy(compat, from, size);
+ memset(compat + size, 0, sizeof(compat) - size);
+ from = (struct bpf_tunnel_key *)compat;
+ break;
+ default:
+ return -EINVAL;
+ }
+ }
+ if (unlikely((!(flags & BPF_F_TUNINFO_IPV6) && from->tunnel_label) ||
+ from->tunnel_ext))
return -EINVAL;
skb_dst_drop(skb);
@@ -1592,14 +1878,31 @@ static u64 bpf_skb_set_tunnel_key(u64 r1, u64 r2, u64 size, u64 flags, u64 r5)
info = &md->u.tun_info;
info->mode = IP_TUNNEL_INFO_TX;
- info->key.tun_flags = TUNNEL_KEY;
+
+ info->key.tun_flags = TUNNEL_KEY | TUNNEL_CSUM | TUNNEL_NOCACHE;
+ if (flags & BPF_F_DONT_FRAGMENT)
+ info->key.tun_flags |= TUNNEL_DONT_FRAGMENT;
+
info->key.tun_id = cpu_to_be64(from->tunnel_id);
- info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ info->key.tos = from->tunnel_tos;
+ info->key.ttl = from->tunnel_ttl;
+
+ if (flags & BPF_F_TUNINFO_IPV6) {
+ info->mode |= IP_TUNNEL_INFO_IPV6;
+ memcpy(&info->key.u.ipv6.dst, from->remote_ipv6,
+ sizeof(from->remote_ipv6));
+ info->key.label = cpu_to_be32(from->tunnel_label) &
+ IPV6_FLOWLABEL_MASK;
+ } else {
+ info->key.u.ipv4.dst = cpu_to_be32(from->remote_ipv4);
+ if (flags & BPF_F_ZERO_CSUM_TX)
+ info->key.tun_flags &= ~TUNNEL_CSUM;
+ }
return 0;
}
-const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
+static const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.func = bpf_skb_set_tunnel_key,
.gpl_only = false,
.ret_type = RET_INTEGER,
@@ -1609,17 +1912,53 @@ const struct bpf_func_proto bpf_skb_set_tunnel_key_proto = {
.arg4_type = ARG_ANYTHING,
};
-static const struct bpf_func_proto *bpf_get_skb_set_tunnel_key_proto(void)
+static u64 bpf_skb_set_tunnel_opt(u64 r1, u64 r2, u64 size, u64 r4, u64 r5)
+{
+ struct sk_buff *skb = (struct sk_buff *) (long) r1;
+ u8 *from = (u8 *) (long) r2;
+ struct ip_tunnel_info *info = skb_tunnel_info(skb);
+ const struct metadata_dst *md = this_cpu_ptr(md_dst);
+
+ if (unlikely(info != &md->u.tun_info || (size & (sizeof(u32) - 1))))
+ return -EINVAL;
+ if (unlikely(size > IP_TUNNEL_OPTS_MAX))
+ return -ENOMEM;
+
+ ip_tunnel_info_opts_set(info, from, size);
+
+ return 0;
+}
+
+static const struct bpf_func_proto bpf_skb_set_tunnel_opt_proto = {
+ .func = bpf_skb_set_tunnel_opt,
+ .gpl_only = false,
+ .ret_type = RET_INTEGER,
+ .arg1_type = ARG_PTR_TO_CTX,
+ .arg2_type = ARG_PTR_TO_STACK,
+ .arg3_type = ARG_CONST_STACK_SIZE,
+};
+
+static const struct bpf_func_proto *
+bpf_get_skb_set_tunnel_proto(enum bpf_func_id which)
{
if (!md_dst) {
- /* race is not possible, since it's called from
- * verifier that is holding verifier mutex
+ /* Race is not possible, since it's called from verifier
+ * that is holding verifier mutex.
*/
- md_dst = metadata_dst_alloc_percpu(0, GFP_KERNEL);
+ md_dst = metadata_dst_alloc_percpu(IP_TUNNEL_OPTS_MAX,
+ GFP_KERNEL);
if (!md_dst)
return NULL;
}
- return &bpf_skb_set_tunnel_key_proto;
+
+ switch (which) {
+ case BPF_FUNC_skb_set_tunnel_key:
+ return &bpf_skb_set_tunnel_key_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return &bpf_skb_set_tunnel_opt_proto;
+ default:
+ return NULL;
+ }
}
static const struct bpf_func_proto *
@@ -1654,6 +1993,10 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
switch (func_id) {
case BPF_FUNC_skb_store_bytes:
return &bpf_skb_store_bytes_proto;
+ case BPF_FUNC_skb_load_bytes:
+ return &bpf_skb_load_bytes_proto;
+ case BPF_FUNC_csum_diff:
+ return &bpf_csum_diff_proto;
case BPF_FUNC_l3_csum_replace:
return &bpf_l3_csum_replace_proto;
case BPF_FUNC_l4_csum_replace:
@@ -1669,7 +2012,11 @@ tc_cls_act_func_proto(enum bpf_func_id func_id)
case BPF_FUNC_skb_get_tunnel_key:
return &bpf_skb_get_tunnel_key_proto;
case BPF_FUNC_skb_set_tunnel_key:
- return bpf_get_skb_set_tunnel_key_proto();
+ return bpf_get_skb_set_tunnel_proto(func_id);
+ case BPF_FUNC_skb_get_tunnel_opt:
+ return &bpf_skb_get_tunnel_opt_proto;
+ case BPF_FUNC_skb_set_tunnel_opt:
+ return bpf_get_skb_set_tunnel_proto(func_id);
case BPF_FUNC_redirect:
return &bpf_redirect_proto;
case BPF_FUNC_get_route_realm:
@@ -1718,16 +2065,14 @@ static bool sk_filter_is_valid_access(int off, int size,
static bool tc_cls_act_is_valid_access(int off, int size,
enum bpf_access_type type)
{
- if (off == offsetof(struct __sk_buff, tc_classid))
- return type == BPF_WRITE ? true : false;
-
if (type == BPF_WRITE) {
switch (off) {
case offsetof(struct __sk_buff, mark):
case offsetof(struct __sk_buff, tc_index):
case offsetof(struct __sk_buff, priority):
case offsetof(struct __sk_buff, cb[0]) ...
- offsetof(struct __sk_buff, cb[4]):
+ offsetof(struct __sk_buff, cb[4]):
+ case offsetof(struct __sk_buff, tc_classid):
break;
default:
return false;
@@ -1844,8 +2189,10 @@ static u32 bpf_net_convert_ctx_access(enum bpf_access_type type, int dst_reg,
ctx_off -= offsetof(struct __sk_buff, tc_classid);
ctx_off += offsetof(struct sk_buff, cb);
ctx_off += offsetof(struct qdisc_skb_cb, tc_classid);
- WARN_ON(type != BPF_WRITE);
- *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ if (type == BPF_WRITE)
+ *insn++ = BPF_STX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
+ else
+ *insn++ = BPF_LDX_MEM(BPF_H, dst_reg, src_reg, ctx_off);
break;
case offsetof(struct __sk_buff, tc_index):
@@ -1908,7 +2255,7 @@ static int __init register_sk_filter_ops(void)
}
late_initcall(register_sk_filter_ops);
-int sk_detach_filter(struct sock *sk)
+int __sk_detach_filter(struct sock *sk, bool locked)
{
int ret = -ENOENT;
struct sk_filter *filter;
@@ -1916,8 +2263,7 @@ int sk_detach_filter(struct sock *sk)
if (sock_flag(sk, SOCK_FILTER_LOCKED))
return -EPERM;
- filter = rcu_dereference_protected(sk->sk_filter,
- sock_owned_by_user(sk));
+ filter = rcu_dereference_protected(sk->sk_filter, locked);
if (filter) {
RCU_INIT_POINTER(sk->sk_filter, NULL);
sk_filter_uncharge(sk, filter);
@@ -1926,7 +2272,12 @@ int sk_detach_filter(struct sock *sk)
return ret;
}
-EXPORT_SYMBOL_GPL(sk_detach_filter);
+EXPORT_SYMBOL_GPL(__sk_detach_filter);
+
+int sk_detach_filter(struct sock *sk)
+{
+ return __sk_detach_filter(sk, sock_owned_by_user(sk));
+}
int sk_get_filter(struct sock *sk, struct sock_filter __user *ubuf,
unsigned int len)
diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index d79699c9d1b9..a669dea146c6 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -19,25 +19,12 @@
#include <net/flow_dissector.h>
#include <scsi/fc/fc_fcoe.h>
-static bool dissector_uses_key(const struct flow_dissector *flow_dissector,
- enum flow_dissector_key_id key_id)
-{
- return flow_dissector->used_keys & (1 << key_id);
-}
-
static void dissector_set_key(struct flow_dissector *flow_dissector,
enum flow_dissector_key_id key_id)
{
flow_dissector->used_keys |= (1 << key_id);
}
-static void *skb_flow_dissector_target(struct flow_dissector *flow_dissector,
- enum flow_dissector_key_id key_id,
- void *target_container)
-{
- return ((char *) target_container) + flow_dissector->offset[key_id];
-}
-
void skb_flow_dissector_init(struct flow_dissector *flow_dissector,
const struct flow_dissector_key *key,
unsigned int key_count)
@@ -178,15 +165,16 @@ ip:
ip_proto = iph->protocol;
- if (!dissector_uses_key(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV4_ADDRS))
- break;
+ if (dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS)) {
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV4_ADDRS,
+ target_container);
- key_addrs = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV4_ADDRS, target_container);
- memcpy(&key_addrs->v4addrs, &iph->saddr,
- sizeof(key_addrs->v4addrs));
- key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ memcpy(&key_addrs->v4addrs, &iph->saddr,
+ sizeof(key_addrs->v4addrs));
+ key_control->addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
+ }
if (ip_is_fragment(iph)) {
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
@@ -208,7 +196,6 @@ ip:
case htons(ETH_P_IPV6): {
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
- __be32 flow_label;
ipv6:
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, hlen, &_iph);
@@ -220,18 +207,21 @@ ipv6:
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_IPV6_ADDRS)) {
- struct flow_dissector_key_ipv6_addrs *key_ipv6_addrs;
-
- key_ipv6_addrs = skb_flow_dissector_target(flow_dissector,
- FLOW_DISSECTOR_KEY_IPV6_ADDRS,
- target_container);
+ key_addrs = skb_flow_dissector_target(flow_dissector,
+ FLOW_DISSECTOR_KEY_IPV6_ADDRS,
+ target_container);
- memcpy(key_ipv6_addrs, &iph->saddr, sizeof(*key_ipv6_addrs));
+ memcpy(&key_addrs->v6addrs, &iph->saddr,
+ sizeof(key_addrs->v6addrs));
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
- flow_label = ip6_flowlabel(iph);
- if (flow_label) {
+ if ((dissector_uses_key(flow_dissector,
+ FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
+ (flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
+ ip6_flowlabel(iph)) {
+ __be32 flow_label = ip6_flowlabel(iph);
+
if (dissector_uses_key(flow_dissector,
FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
key_tags = skb_flow_dissector_target(flow_dissector,
@@ -336,8 +326,11 @@ mpls:
}
case htons(ETH_P_FCOE):
- key_control->thoff = (u16)(nhoff + FCOE_HEADER_LEN);
- /* fall through */
+ if ((hlen - nhoff) < FCOE_HEADER_LEN)
+ goto out_bad;
+
+ nhoff += FCOE_HEADER_LEN;
+ goto out_good;
default:
goto out_bad;
}
@@ -396,6 +389,13 @@ ip_proto_again:
goto out_bad;
proto = eth->h_proto;
nhoff += sizeof(*eth);
+
+ /* Cap headers that we access via pointers at the
+ * end of the Ethernet header as our maximum alignment
+ * at that point is only 2 bytes.
+ */
+ if (NET_IP_ALIGN)
+ hlen = nhoff;
}
key_control->flags |= FLOW_DIS_ENCAPSULATION;
@@ -437,13 +437,12 @@ ip_proto_again:
key_control->flags |= FLOW_DIS_IS_FRAGMENT;
nhoff += sizeof(_fh);
+ ip_proto = fh->nexthdr;
if (!(fh->frag_off & htons(IP6_OFFSET))) {
key_control->flags |= FLOW_DIS_FIRST_FRAG;
- if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG) {
- ip_proto = fh->nexthdr;
+ if (flags & FLOW_DISSECTOR_F_PARSE_1ST_FRAG)
goto ip_proto_again;
- }
}
goto out_good;
}
@@ -730,6 +729,11 @@ u32 __skb_get_poff(const struct sk_buff *skb, void *data,
{
u32 poff = keys->control.thoff;
+ /* skip L4 headers for fragments after the first */
+ if ((keys->control.flags & FLOW_DIS_IS_FRAGMENT) &&
+ !(keys->control.flags & FLOW_DIS_FIRST_FRAG))
+ return poff;
+
switch (keys->basic.ip_proto) {
case IPPROTO_TCP: {
/* access doff as u8 to avoid unaligned access */
diff --git a/net/core/gen_estimator.c b/net/core/gen_estimator.c
index 92d886f4adcb..4573d81093fe 100644
--- a/net/core/gen_estimator.c
+++ b/net/core/gen_estimator.c
@@ -191,6 +191,7 @@ struct gen_estimator *gen_find_node(const struct gnet_stats_basic_packed *bstats
/**
* gen_new_estimator - create a new rate estimator
* @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
@@ -287,6 +288,7 @@ EXPORT_SYMBOL(gen_kill_estimator);
/**
* gen_replace_estimator - replace rate estimator configuration
* @bstats: basic statistics
+ * @cpu_bstats: bstats per cpu
* @rate_est: rate estimator statistics
* @stats_lock: statistics lock
* @opt: rate estimator configuration TLV
diff --git a/net/core/gen_stats.c b/net/core/gen_stats.c
index 1e2f46a69d50..e640462ea8bf 100644
--- a/net/core/gen_stats.c
+++ b/net/core/gen_stats.c
@@ -140,6 +140,7 @@ EXPORT_SYMBOL(__gnet_stats_copy_basic);
/**
* gnet_stats_copy_basic - copy basic statistics into statistic TLV
* @d: dumping handle
+ * @cpu: copy statistic per cpu
* @b: basic statistics
*
* Appends the basic statistics to the top level TLV created by
diff --git a/net/core/hwbm.c b/net/core/hwbm.c
new file mode 100644
index 000000000000..941c28486896
--- /dev/null
+++ b/net/core/hwbm.c
@@ -0,0 +1,87 @@
+/* Support for hardware buffer manager.
+ *
+ * Copyright (C) 2016 Marvell
+ *
+ * Gregory CLEMENT <gregory.clement@free-electrons.com>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <linux/kernel.h>
+#include <linux/printk.h>
+#include <linux/skbuff.h>
+#include <net/hwbm.h>
+
+void hwbm_buf_free(struct hwbm_pool *bm_pool, void *buf)
+{
+ if (likely(bm_pool->frag_size <= PAGE_SIZE))
+ skb_free_frag(buf);
+ else
+ kfree(buf);
+}
+EXPORT_SYMBOL_GPL(hwbm_buf_free);
+
+/* Refill processing for HW buffer management */
+int hwbm_pool_refill(struct hwbm_pool *bm_pool, gfp_t gfp)
+{
+ int frag_size = bm_pool->frag_size;
+ void *buf;
+
+ if (likely(frag_size <= PAGE_SIZE))
+ buf = netdev_alloc_frag(frag_size);
+ else
+ buf = kmalloc(frag_size, gfp);
+
+ if (!buf)
+ return -ENOMEM;
+
+ if (bm_pool->construct)
+ if (bm_pool->construct(bm_pool, buf)) {
+ hwbm_buf_free(bm_pool, buf);
+ return -ENOMEM;
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_refill);
+
+int hwbm_pool_add(struct hwbm_pool *bm_pool, unsigned int buf_num, gfp_t gfp)
+{
+ int err, i;
+ unsigned long flags;
+
+ spin_lock_irqsave(&bm_pool->lock, flags);
+ if (bm_pool->buf_num == bm_pool->size) {
+ pr_warn("pool already filled\n");
+ return bm_pool->buf_num;
+ }
+
+ if (buf_num + bm_pool->buf_num > bm_pool->size) {
+ pr_warn("cannot allocate %d buffers for pool\n",
+ buf_num);
+ return 0;
+ }
+
+ if ((buf_num + bm_pool->buf_num) < bm_pool->buf_num) {
+ pr_warn("Adding %d buffers to the %d current buffers will overflow\n",
+ buf_num, bm_pool->buf_num);
+ return 0;
+ }
+
+ for (i = 0; i < buf_num; i++) {
+ err = hwbm_pool_refill(bm_pool, gfp);
+ if (err < 0)
+ break;
+ }
+
+ /* Update BM driver with number of buffers added to pool */
+ bm_pool->buf_num += i;
+
+ pr_debug("hwpm pool: %d of %d buffers added\n", i, buf_num);
+ spin_unlock_irqrestore(&bm_pool->lock, flags);
+
+ return i;
+}
+EXPORT_SYMBOL_GPL(hwbm_pool_add);
diff --git a/net/core/lwtunnel.c b/net/core/lwtunnel.c
index 299cfc24d888..669ecc9f884e 100644
--- a/net/core/lwtunnel.c
+++ b/net/core/lwtunnel.c
@@ -27,6 +27,31 @@
#include <net/rtnetlink.h>
#include <net/ip6_fib.h>
+#ifdef CONFIG_MODULES
+
+static const char *lwtunnel_encap_str(enum lwtunnel_encap_types encap_type)
+{
+ /* Only lwt encaps implemented without using an interface for
+ * the encap need to return a string here.
+ */
+ switch (encap_type) {
+ case LWTUNNEL_ENCAP_MPLS:
+ return "MPLS";
+ case LWTUNNEL_ENCAP_ILA:
+ return "ILA";
+ case LWTUNNEL_ENCAP_IP6:
+ case LWTUNNEL_ENCAP_IP:
+ case LWTUNNEL_ENCAP_NONE:
+ case __LWTUNNEL_ENCAP_MAX:
+ /* should not have got here */
+ WARN_ON(1);
+ break;
+ }
+ return NULL;
+}
+
+#endif /* CONFIG_MODULES */
+
struct lwtunnel_state *lwtunnel_state_alloc(int encap_len)
{
struct lwtunnel_state *lws;
@@ -85,6 +110,18 @@ int lwtunnel_build_state(struct net_device *dev, u16 encap_type,
ret = -EOPNOTSUPP;
rcu_read_lock();
ops = rcu_dereference(lwtun_encaps[encap_type]);
+#ifdef CONFIG_MODULES
+ if (!ops) {
+ const char *encap_type_str = lwtunnel_encap_str(encap_type);
+
+ if (encap_type_str) {
+ rcu_read_unlock();
+ request_module("rtnl-lwt-%s", encap_type_str);
+ rcu_read_lock();
+ ops = rcu_dereference(lwtun_encaps[encap_type]);
+ }
+ }
+#endif
if (likely(ops && ops->build_state))
ret = ops->build_state(dev, encap, family, cfg, lws);
rcu_read_unlock();
diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 1aa8437ed6c4..f18ae91b652e 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -857,7 +857,7 @@ static void neigh_probe(struct neighbour *neigh)
struct sk_buff *skb = skb_peek_tail(&neigh->arp_queue);
/* keep skb alive even if arp_queue overflows */
if (skb)
- skb = skb_copy(skb, GFP_ATOMIC);
+ skb = skb_clone(skb, GFP_ATOMIC);
write_unlock(&neigh->lock);
neigh->ops->solicit(neigh, skb);
atomic_inc(&neigh->probes);
@@ -2215,7 +2215,7 @@ static int pneigh_fill_info(struct sk_buff *skb, struct pneigh_entry *pn,
ndm->ndm_pad2 = 0;
ndm->ndm_flags = pn->flags | NTF_PROXY;
ndm->ndm_type = RTN_UNICAST;
- ndm->ndm_ifindex = pn->dev->ifindex;
+ ndm->ndm_ifindex = pn->dev ? pn->dev->ifindex : 0;
ndm->ndm_state = NUD_NONE;
if (nla_put(skb, NDA_DST, tbl->key_len, pn->key))
@@ -2333,7 +2333,7 @@ static int pneigh_dump_table(struct neigh_table *tbl, struct sk_buff *skb,
if (h > s_h)
s_idx = 0;
for (n = tbl->phash_buckets[h], idx = 0; n; n = n->next) {
- if (dev_net(n->dev) != net)
+ if (pneigh_net(n) != net)
continue;
if (idx < s_idx)
goto next;
diff --git a/net/core/net-sysfs.c b/net/core/net-sysfs.c
index f88a62ab019d..2b3f76fe65f4 100644
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -29,7 +29,6 @@
#ifdef CONFIG_SYSFS
static const char fmt_hex[] = "%#x\n";
-static const char fmt_long_hex[] = "%#lx\n";
static const char fmt_dec[] = "%d\n";
static const char fmt_ulong[] = "%lu\n";
static const char fmt_u64[] = "%llu\n";
@@ -199,9 +198,10 @@ static ssize_t speed_show(struct device *dev,
return restart_syscall();
if (netif_running(netdev)) {
- struct ethtool_cmd cmd;
- if (!__ethtool_get_settings(netdev, &cmd))
- ret = sprintf(buf, fmt_dec, ethtool_cmd_speed(&cmd));
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd))
+ ret = sprintf(buf, fmt_dec, cmd.base.speed);
}
rtnl_unlock();
return ret;
@@ -218,10 +218,12 @@ static ssize_t duplex_show(struct device *dev,
return restart_syscall();
if (netif_running(netdev)) {
- struct ethtool_cmd cmd;
- if (!__ethtool_get_settings(netdev, &cmd)) {
+ struct ethtool_link_ksettings cmd;
+
+ if (!__ethtool_get_link_ksettings(netdev, &cmd)) {
const char *duplex;
- switch (cmd.duplex) {
+
+ switch (cmd.base.duplex) {
case DUPLEX_HALF:
duplex = "half";
break;
@@ -471,6 +473,7 @@ static ssize_t phys_switch_id_show(struct device *dev,
if (dev_isalive(netdev)) {
struct switchdev_attr attr = {
+ .orig_dev = netdev,
.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
.flags = SWITCHDEV_F_NO_RECURSE,
};
@@ -573,6 +576,7 @@ NETSTAT_ENTRY(tx_heartbeat_errors);
NETSTAT_ENTRY(tx_window_errors);
NETSTAT_ENTRY(rx_compressed);
NETSTAT_ENTRY(tx_compressed);
+NETSTAT_ENTRY(rx_nohandler);
static struct attribute *netstat_attrs[] = {
&dev_attr_rx_packets.attr,
@@ -598,6 +602,7 @@ static struct attribute *netstat_attrs[] = {
&dev_attr_tx_window_errors.attr,
&dev_attr_rx_compressed.attr,
&dev_attr_tx_compressed.attr,
+ &dev_attr_rx_nohandler.attr,
NULL
};
@@ -1452,8 +1457,8 @@ static void netdev_release(struct device *d)
static const void *net_namespace(struct device *d)
{
- struct net_device *dev;
- dev = container_of(d, struct net_device, dev);
+ struct net_device *dev = to_net_dev(d);
+
return dev_net(dev);
}
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index adef015b2f41..92da5e4ceb4f 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -32,6 +32,10 @@
#include <trace/events/sock.h>
#include <trace/events/udp.h>
#include <trace/events/fib.h>
+#if IS_ENABLED(CONFIG_IPV6)
+#include <trace/events/fib6.h>
+EXPORT_TRACEPOINT_SYMBOL_GPL(fib6_table_lookup);
+#endif
EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
diff --git a/net/core/netclassid_cgroup.c b/net/core/netclassid_cgroup.c
index 6441f47b1a8f..11fce17274f6 100644
--- a/net/core/netclassid_cgroup.c
+++ b/net/core/netclassid_cgroup.c
@@ -9,7 +9,6 @@
* Authors: Thomas Graf <tgraf@suug.ch>
*/
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/cgroup.h>
#include <linux/fdtable.h>
@@ -56,29 +55,41 @@ static void cgrp_css_free(struct cgroup_subsys_state *css)
kfree(css_cls_state(css));
}
-static int update_classid(const void *v, struct file *file, unsigned n)
+static int update_classid_sock(const void *v, struct file *file, unsigned n)
{
int err;
struct socket *sock = sock_from_file(file, &err);
- if (sock)
- sock->sk->sk_classid = (u32)(unsigned long)v;
-
+ if (sock) {
+ spin_lock(&cgroup_sk_update_lock);
+ sock_cgroup_set_classid(&sock->sk->sk_cgrp_data,
+ (unsigned long)v);
+ spin_unlock(&cgroup_sk_update_lock);
+ }
return 0;
}
-static void cgrp_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void update_classid(struct cgroup_subsys_state *css, void *v)
{
- struct cgroup_cls_state *cs = css_cls_state(css);
- void *v = (void *)(unsigned long)cs->classid;
+ struct css_task_iter it;
struct task_struct *p;
- cgroup_taskset_for_each(p, tset) {
+ css_task_iter_start(css, &it);
+ while ((p = css_task_iter_next(&it))) {
task_lock(p);
- iterate_fd(p->files, 0, update_classid, v);
+ iterate_fd(p->files, 0, update_classid_sock, v);
task_unlock(p);
}
+ css_task_iter_end(&it);
+}
+
+static void cgrp_attach(struct cgroup_taskset *tset)
+{
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_first(tset, &css);
+ update_classid(css,
+ (void *)(unsigned long)css_cls_state(css)->classid);
}
static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
@@ -89,8 +100,13 @@ static u64 read_classid(struct cgroup_subsys_state *css, struct cftype *cft)
static int write_classid(struct cgroup_subsys_state *css, struct cftype *cft,
u64 value)
{
- css_cls_state(css)->classid = (u32) value;
+ struct cgroup_cls_state *cs = css_cls_state(css);
+
+ cgroup_sk_alloc_disable();
+
+ cs->classid = (u32)value;
+ update_classid(css, (void *)(unsigned long)cs->classid);
return 0;
}
diff --git a/net/core/netprio_cgroup.c b/net/core/netprio_cgroup.c
index cbd0a199bf52..2ec86fc552df 100644
--- a/net/core/netprio_cgroup.c
+++ b/net/core/netprio_cgroup.c
@@ -11,7 +11,6 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
-#include <linux/module.h>
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/string.h>
@@ -27,6 +26,12 @@
#include <linux/fdtable.h>
+/*
+ * netprio allocates per-net_device priomap array which is indexed by
+ * css->id. Limiting css ID to 16bits doesn't lose anything.
+ */
+#define NETPRIO_ID_MAX USHRT_MAX
+
#define PRIOMAP_MIN_SZ 128
/*
@@ -144,6 +149,9 @@ static int cgrp_css_online(struct cgroup_subsys_state *css)
struct net_device *dev;
int ret = 0;
+ if (css->id > NETPRIO_ID_MAX)
+ return -ENOSPC;
+
if (!parent_css)
return 0;
@@ -200,6 +208,8 @@ static ssize_t write_priomap(struct kernfs_open_file *of,
if (!dev)
return -ENODEV;
+ cgroup_sk_alloc_disable();
+
rtnl_lock();
ret = netprio_set_prio(of_css(of), dev, prio);
@@ -213,18 +223,23 @@ static int update_netprio(const void *v, struct file *file, unsigned n)
{
int err;
struct socket *sock = sock_from_file(file, &err);
- if (sock)
- sock->sk->sk_cgrp_prioidx = (u32)(unsigned long)v;
+ if (sock) {
+ spin_lock(&cgroup_sk_update_lock);
+ sock_cgroup_set_prioidx(&sock->sk->sk_cgrp_data,
+ (unsigned long)v);
+ spin_unlock(&cgroup_sk_update_lock);
+ }
return 0;
}
-static void net_prio_attach(struct cgroup_subsys_state *css,
- struct cgroup_taskset *tset)
+static void net_prio_attach(struct cgroup_taskset *tset)
{
struct task_struct *p;
- void *v = (void *)(unsigned long)css->cgroup->id;
+ struct cgroup_subsys_state *css;
+
+ cgroup_taskset_for_each(p, css, tset) {
+ void *v = (void *)(unsigned long)css->cgroup->id;
- cgroup_taskset_for_each(p, tset) {
task_lock(p);
iterate_fd(p->files, 0, update_netprio, v);
task_unlock(p);
diff --git a/net/core/pktgen.c b/net/core/pktgen.c
index de8d5cc5eb24..20999aa596dd 100644
--- a/net/core/pktgen.c
+++ b/net/core/pktgen.c
@@ -2787,7 +2787,9 @@ static struct sk_buff *pktgen_alloc_skb(struct net_device *dev,
} else {
skb = __netdev_alloc_skb(dev, size, GFP_NOWAIT);
}
- skb_reserve(skb, LL_RESERVED_SPACE(dev));
+
+ if (likely(skb))
+ skb_reserve(skb, LL_RESERVED_SPACE(dev));
return skb;
}
@@ -2854,7 +2856,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
*vlan_encapsulated_proto = htons(ETH_P_IP);
}
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->len);
iph = (struct iphdr *) skb_put(skb, sizeof(struct iphdr));
@@ -2898,7 +2900,7 @@ static struct sk_buff *fill_packet_ipv4(struct net_device *odev,
if (!(pkt_dev->flags & F_UDPCSUM)) {
skb->ip_summed = CHECKSUM_NONE;
- } else if (odev->features & NETIF_F_V4_CSUM) {
+ } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM)) {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum = 0;
udp4_hwcsum(skb, iph->saddr, iph->daddr);
@@ -2981,7 +2983,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
*vlan_encapsulated_proto = htons(ETH_P_IPV6);
}
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb_set_network_header(skb, skb->len);
iph = (struct ipv6hdr *) skb_put(skb, sizeof(struct ipv6hdr));
@@ -3032,7 +3034,7 @@ static struct sk_buff *fill_packet_ipv6(struct net_device *odev,
if (!(pkt_dev->flags & F_UDPCSUM)) {
skb->ip_summed = CHECKSUM_NONE;
- } else if (odev->features & NETIF_F_V6_CSUM) {
+ } else if (odev->features & (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM)) {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 504bd17b7456..a75f7e94b445 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -804,6 +804,8 @@ static void copy_rtnl_link_stats(struct rtnl_link_stats *a,
a->rx_compressed = b->rx_compressed;
a->tx_compressed = b->tx_compressed;
+
+ a->rx_nohandler = b->rx_nohandler;
}
static void copy_rtnl_link_stats64(void *v, const struct rtnl_link_stats64 *b)
@@ -893,6 +895,8 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ nla_total_size(4) /* IFLA_PROMISCUITY */
+ nla_total_size(4) /* IFLA_NUM_TX_QUEUES */
+ nla_total_size(4) /* IFLA_NUM_RX_QUEUES */
+ + nla_total_size(4) /* IFLA_MAX_GSO_SEGS */
+ + nla_total_size(4) /* IFLA_MAX_GSO_SIZE */
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(1) /* IFLA_LINKMODE */
+ nla_total_size(4) /* IFLA_CARRIER_CHANGES */
@@ -905,6 +909,7 @@ static noinline size_t if_nlmsg_size(const struct net_device *dev,
+ rtnl_link_get_af_size(dev, ext_filter_mask) /* IFLA_AF_SPEC */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_PORT_ID */
+ nla_total_size(MAX_PHYS_ITEM_ID_LEN) /* IFLA_PHYS_SWITCH_ID */
+ + nla_total_size(IFNAMSIZ) /* IFLA_PHYS_PORT_NAME */
+ nla_total_size(1); /* IFLA_PROTO_DOWN */
}
@@ -1027,6 +1032,7 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
{
int err;
struct switchdev_attr attr = {
+ .orig_dev = dev,
.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
.flags = SWITCHDEV_F_NO_RECURSE,
};
@@ -1045,15 +1051,156 @@ static int rtnl_phys_switch_id_fill(struct sk_buff *skb, struct net_device *dev)
return 0;
}
+static noinline_for_stack int rtnl_fill_stats(struct sk_buff *skb,
+ struct net_device *dev)
+{
+ const struct rtnl_link_stats64 *stats;
+ struct rtnl_link_stats64 temp;
+ struct nlattr *attr;
+
+ stats = dev_get_stats(dev, &temp);
+
+ attr = nla_reserve(skb, IFLA_STATS,
+ sizeof(struct rtnl_link_stats));
+ if (!attr)
+ return -EMSGSIZE;
+
+ copy_rtnl_link_stats(nla_data(attr), stats);
+
+ attr = nla_reserve(skb, IFLA_STATS64,
+ sizeof(struct rtnl_link_stats64));
+ if (!attr)
+ return -EMSGSIZE;
+
+ copy_rtnl_link_stats64(nla_data(attr), stats);
+
+ return 0;
+}
+
+static noinline_for_stack int rtnl_fill_vfinfo(struct sk_buff *skb,
+ struct net_device *dev,
+ int vfs_num,
+ struct nlattr *vfinfo)
+{
+ struct ifla_vf_rss_query_en vf_rss_query_en;
+ struct ifla_vf_link_state vf_linkstate;
+ struct ifla_vf_spoofchk vf_spoofchk;
+ struct ifla_vf_tx_rate vf_tx_rate;
+ struct ifla_vf_stats vf_stats;
+ struct ifla_vf_trust vf_trust;
+ struct ifla_vf_vlan vf_vlan;
+ struct ifla_vf_rate vf_rate;
+ struct nlattr *vf, *vfstats;
+ struct ifla_vf_mac vf_mac;
+ struct ifla_vf_info ivi;
+
+ /* Not all SR-IOV capable drivers support the
+ * spoofcheck and "RSS query enable" query. Preset to
+ * -1 so the user space tool can detect that the driver
+ * didn't report anything.
+ */
+ ivi.spoofchk = -1;
+ ivi.rss_query_en = -1;
+ ivi.trusted = -1;
+ memset(ivi.mac, 0, sizeof(ivi.mac));
+ /* The default value for VF link state is "auto"
+ * IFLA_VF_LINK_STATE_AUTO which equals zero
+ */
+ ivi.linkstate = 0;
+ if (dev->netdev_ops->ndo_get_vf_config(dev, vfs_num, &ivi))
+ return 0;
+
+ vf_mac.vf =
+ vf_vlan.vf =
+ vf_rate.vf =
+ vf_tx_rate.vf =
+ vf_spoofchk.vf =
+ vf_linkstate.vf =
+ vf_rss_query_en.vf =
+ vf_trust.vf = ivi.vf;
+
+ memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
+ vf_vlan.vlan = ivi.vlan;
+ vf_vlan.qos = ivi.qos;
+ vf_tx_rate.rate = ivi.max_tx_rate;
+ vf_rate.min_tx_rate = ivi.min_tx_rate;
+ vf_rate.max_tx_rate = ivi.max_tx_rate;
+ vf_spoofchk.setting = ivi.spoofchk;
+ vf_linkstate.link_state = ivi.linkstate;
+ vf_rss_query_en.setting = ivi.rss_query_en;
+ vf_trust.setting = ivi.trusted;
+ vf = nla_nest_start(skb, IFLA_VF_INFO);
+ if (!vf) {
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
+ }
+ if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
+ nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
+ nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
+ &vf_rate) ||
+ nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
+ &vf_tx_rate) ||
+ nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
+ &vf_spoofchk) ||
+ nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
+ &vf_linkstate) ||
+ nla_put(skb, IFLA_VF_RSS_QUERY_EN,
+ sizeof(vf_rss_query_en),
+ &vf_rss_query_en) ||
+ nla_put(skb, IFLA_VF_TRUST,
+ sizeof(vf_trust), &vf_trust))
+ return -EMSGSIZE;
+ memset(&vf_stats, 0, sizeof(vf_stats));
+ if (dev->netdev_ops->ndo_get_vf_stats)
+ dev->netdev_ops->ndo_get_vf_stats(dev, vfs_num,
+ &vf_stats);
+ vfstats = nla_nest_start(skb, IFLA_VF_STATS);
+ if (!vfstats) {
+ nla_nest_cancel(skb, vf);
+ nla_nest_cancel(skb, vfinfo);
+ return -EMSGSIZE;
+ }
+ if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS,
+ vf_stats.rx_packets) ||
+ nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS,
+ vf_stats.tx_packets) ||
+ nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES,
+ vf_stats.rx_bytes) ||
+ nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES,
+ vf_stats.tx_bytes) ||
+ nla_put_u64(skb, IFLA_VF_STATS_BROADCAST,
+ vf_stats.broadcast) ||
+ nla_put_u64(skb, IFLA_VF_STATS_MULTICAST,
+ vf_stats.multicast))
+ return -EMSGSIZE;
+ nla_nest_end(skb, vfstats);
+ nla_nest_end(skb, vf);
+ return 0;
+}
+
+static int rtnl_fill_link_ifmap(struct sk_buff *skb, struct net_device *dev)
+{
+ struct rtnl_link_ifmap map = {
+ .mem_start = dev->mem_start,
+ .mem_end = dev->mem_end,
+ .base_addr = dev->base_addr,
+ .irq = dev->irq,
+ .dma = dev->dma,
+ .port = dev->if_port,
+ };
+ if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
+ return -EMSGSIZE;
+
+ return 0;
+}
+
static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
int type, u32 pid, u32 seq, u32 change,
unsigned int flags, u32 ext_filter_mask)
{
struct ifinfomsg *ifm;
struct nlmsghdr *nlh;
- struct rtnl_link_stats64 temp;
- const struct rtnl_link_stats64 *stats;
- struct nlattr *attr, *af_spec;
+ struct nlattr *af_spec;
struct rtnl_af_ops *af_ops;
struct net_device *upper_dev = netdev_master_upper_dev_get(dev);
@@ -1079,6 +1226,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_put_u32(skb, IFLA_GROUP, dev->group) ||
nla_put_u32(skb, IFLA_PROMISCUITY, dev->promiscuity) ||
nla_put_u32(skb, IFLA_NUM_TX_QUEUES, dev->num_tx_queues) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SEGS, dev->gso_max_segs) ||
+ nla_put_u32(skb, IFLA_GSO_MAX_SIZE, dev->gso_max_size) ||
#ifdef CONFIG_RPS
nla_put_u32(skb, IFLA_NUM_RX_QUEUES, dev->num_rx_queues) ||
#endif
@@ -1096,18 +1245,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
nla_put_u8(skb, IFLA_PROTO_DOWN, dev->proto_down))
goto nla_put_failure;
- if (1) {
- struct rtnl_link_ifmap map = {
- .mem_start = dev->mem_start,
- .mem_end = dev->mem_end,
- .base_addr = dev->base_addr,
- .irq = dev->irq,
- .dma = dev->dma,
- .port = dev->if_port,
- };
- if (nla_put(skb, IFLA_MAP, sizeof(map), &map))
- goto nla_put_failure;
- }
+ if (rtnl_fill_link_ifmap(skb, dev))
+ goto nla_put_failure;
if (dev->addr_len) {
if (nla_put(skb, IFLA_ADDRESS, dev->addr_len, dev->dev_addr) ||
@@ -1124,128 +1263,27 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct net_device *dev,
if (rtnl_phys_switch_id_fill(skb, dev))
goto nla_put_failure;
- attr = nla_reserve(skb, IFLA_STATS,
- sizeof(struct rtnl_link_stats));
- if (attr == NULL)
+ if (rtnl_fill_stats(skb, dev))
goto nla_put_failure;
- stats = dev_get_stats(dev, &temp);
- copy_rtnl_link_stats(nla_data(attr), stats);
-
- attr = nla_reserve(skb, IFLA_STATS64,
- sizeof(struct rtnl_link_stats64));
- if (attr == NULL)
- goto nla_put_failure;
- copy_rtnl_link_stats64(nla_data(attr), stats);
-
if (dev->dev.parent && (ext_filter_mask & RTEXT_FILTER_VF) &&
nla_put_u32(skb, IFLA_NUM_VF, dev_num_vf(dev->dev.parent)))
goto nla_put_failure;
- if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent
- && (ext_filter_mask & RTEXT_FILTER_VF)) {
+ if (dev->netdev_ops->ndo_get_vf_config && dev->dev.parent &&
+ ext_filter_mask & RTEXT_FILTER_VF) {
int i;
-
- struct nlattr *vfinfo, *vf, *vfstats;
+ struct nlattr *vfinfo;
int num_vfs = dev_num_vf(dev->dev.parent);
vfinfo = nla_nest_start(skb, IFLA_VFINFO_LIST);
if (!vfinfo)
goto nla_put_failure;
for (i = 0; i < num_vfs; i++) {
- struct ifla_vf_info ivi;
- struct ifla_vf_mac vf_mac;
- struct ifla_vf_vlan vf_vlan;
- struct ifla_vf_rate vf_rate;
- struct ifla_vf_tx_rate vf_tx_rate;
- struct ifla_vf_spoofchk vf_spoofchk;
- struct ifla_vf_link_state vf_linkstate;
- struct ifla_vf_rss_query_en vf_rss_query_en;
- struct ifla_vf_stats vf_stats;
- struct ifla_vf_trust vf_trust;
-
- /*
- * Not all SR-IOV capable drivers support the
- * spoofcheck and "RSS query enable" query. Preset to
- * -1 so the user space tool can detect that the driver
- * didn't report anything.
- */
- ivi.spoofchk = -1;
- ivi.rss_query_en = -1;
- ivi.trusted = -1;
- memset(ivi.mac, 0, sizeof(ivi.mac));
- /* The default value for VF link state is "auto"
- * IFLA_VF_LINK_STATE_AUTO which equals zero
- */
- ivi.linkstate = 0;
- if (dev->netdev_ops->ndo_get_vf_config(dev, i, &ivi))
- break;
- vf_mac.vf =
- vf_vlan.vf =
- vf_rate.vf =
- vf_tx_rate.vf =
- vf_spoofchk.vf =
- vf_linkstate.vf =
- vf_rss_query_en.vf =
- vf_trust.vf = ivi.vf;
-
- memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
- vf_vlan.vlan = ivi.vlan;
- vf_vlan.qos = ivi.qos;
- vf_tx_rate.rate = ivi.max_tx_rate;
- vf_rate.min_tx_rate = ivi.min_tx_rate;
- vf_rate.max_tx_rate = ivi.max_tx_rate;
- vf_spoofchk.setting = ivi.spoofchk;
- vf_linkstate.link_state = ivi.linkstate;
- vf_rss_query_en.setting = ivi.rss_query_en;
- vf_trust.setting = ivi.trusted;
- vf = nla_nest_start(skb, IFLA_VF_INFO);
- if (!vf) {
- nla_nest_cancel(skb, vfinfo);
- goto nla_put_failure;
- }
- if (nla_put(skb, IFLA_VF_MAC, sizeof(vf_mac), &vf_mac) ||
- nla_put(skb, IFLA_VF_VLAN, sizeof(vf_vlan), &vf_vlan) ||
- nla_put(skb, IFLA_VF_RATE, sizeof(vf_rate),
- &vf_rate) ||
- nla_put(skb, IFLA_VF_TX_RATE, sizeof(vf_tx_rate),
- &vf_tx_rate) ||
- nla_put(skb, IFLA_VF_SPOOFCHK, sizeof(vf_spoofchk),
- &vf_spoofchk) ||
- nla_put(skb, IFLA_VF_LINK_STATE, sizeof(vf_linkstate),
- &vf_linkstate) ||
- nla_put(skb, IFLA_VF_RSS_QUERY_EN,
- sizeof(vf_rss_query_en),
- &vf_rss_query_en) ||
- nla_put(skb, IFLA_VF_TRUST,
- sizeof(vf_trust), &vf_trust))
+ if (rtnl_fill_vfinfo(skb, dev, i, vfinfo))
goto nla_put_failure;
- memset(&vf_stats, 0, sizeof(vf_stats));
- if (dev->netdev_ops->ndo_get_vf_stats)
- dev->netdev_ops->ndo_get_vf_stats(dev, i,
- &vf_stats);
- vfstats = nla_nest_start(skb, IFLA_VF_STATS);
- if (!vfstats) {
- nla_nest_cancel(skb, vf);
- nla_nest_cancel(skb, vfinfo);
- goto nla_put_failure;
- }
- if (nla_put_u64(skb, IFLA_VF_STATS_RX_PACKETS,
- vf_stats.rx_packets) ||
- nla_put_u64(skb, IFLA_VF_STATS_TX_PACKETS,
- vf_stats.tx_packets) ||
- nla_put_u64(skb, IFLA_VF_STATS_RX_BYTES,
- vf_stats.rx_bytes) ||
- nla_put_u64(skb, IFLA_VF_STATS_TX_BYTES,
- vf_stats.tx_bytes) ||
- nla_put_u64(skb, IFLA_VF_STATS_BROADCAST,
- vf_stats.broadcast) ||
- nla_put_u64(skb, IFLA_VF_STATS_MULTICAST,
- vf_stats.multicast))
- goto nla_put_failure;
- nla_nest_end(skb, vfstats);
- nla_nest_end(skb, vf);
}
+
nla_nest_end(skb, vfinfo);
}
@@ -1356,15 +1394,8 @@ static const struct nla_policy ifla_vf_policy[IFLA_VF_MAX+1] = {
[IFLA_VF_RSS_QUERY_EN] = { .len = sizeof(struct ifla_vf_rss_query_en) },
[IFLA_VF_STATS] = { .type = NLA_NESTED },
[IFLA_VF_TRUST] = { .len = sizeof(struct ifla_vf_trust) },
-};
-
-static const struct nla_policy ifla_vf_stats_policy[IFLA_VF_STATS_MAX + 1] = {
- [IFLA_VF_STATS_RX_PACKETS] = { .type = NLA_U64 },
- [IFLA_VF_STATS_TX_PACKETS] = { .type = NLA_U64 },
- [IFLA_VF_STATS_RX_BYTES] = { .type = NLA_U64 },
- [IFLA_VF_STATS_TX_BYTES] = { .type = NLA_U64 },
- [IFLA_VF_STATS_BROADCAST] = { .type = NLA_U64 },
- [IFLA_VF_STATS_MULTICAST] = { .type = NLA_U64 },
+ [IFLA_VF_IB_NODE_GUID] = { .len = sizeof(struct ifla_vf_guid) },
+ [IFLA_VF_IB_PORT_GUID] = { .len = sizeof(struct ifla_vf_guid) },
};
static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
@@ -1381,6 +1412,58 @@ static const struct nla_policy ifla_port_policy[IFLA_PORT_MAX+1] = {
[IFLA_PORT_RESPONSE] = { .type = NLA_U16, },
};
+static const struct rtnl_link_ops *linkinfo_to_kind_ops(const struct nlattr *nla)
+{
+ const struct rtnl_link_ops *ops = NULL;
+ struct nlattr *linfo[IFLA_INFO_MAX + 1];
+
+ if (nla_parse_nested(linfo, IFLA_INFO_MAX, nla, ifla_info_policy) < 0)
+ return NULL;
+
+ if (linfo[IFLA_INFO_KIND]) {
+ char kind[MODULE_NAME_LEN];
+
+ nla_strlcpy(kind, linfo[IFLA_INFO_KIND], sizeof(kind));
+ ops = rtnl_link_ops_get(kind);
+ }
+
+ return ops;
+}
+
+static bool link_master_filtered(struct net_device *dev, int master_idx)
+{
+ struct net_device *master;
+
+ if (!master_idx)
+ return false;
+
+ master = netdev_master_upper_dev_get(dev);
+ if (!master || master->ifindex != master_idx)
+ return true;
+
+ return false;
+}
+
+static bool link_kind_filtered(const struct net_device *dev,
+ const struct rtnl_link_ops *kind_ops)
+{
+ if (kind_ops && dev->rtnl_link_ops != kind_ops)
+ return true;
+
+ return false;
+}
+
+static bool link_dump_filtered(struct net_device *dev,
+ int master_idx,
+ const struct rtnl_link_ops *kind_ops)
+{
+ if (link_master_filtered(dev, master_idx) ||
+ link_kind_filtered(dev, kind_ops))
+ return true;
+
+ return false;
+}
+
static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
{
struct net *net = sock_net(skb->sk);
@@ -1390,6 +1473,9 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
struct hlist_head *head;
struct nlattr *tb[IFLA_MAX+1];
u32 ext_filter_mask = 0;
+ const struct rtnl_link_ops *kind_ops = NULL;
+ unsigned int flags = NLM_F_MULTI;
+ int master_idx = 0;
int err;
int hdrlen;
@@ -1412,18 +1498,29 @@ static int rtnl_dump_ifinfo(struct sk_buff *skb, struct netlink_callback *cb)
if (tb[IFLA_EXT_MASK])
ext_filter_mask = nla_get_u32(tb[IFLA_EXT_MASK]);
+
+ if (tb[IFLA_MASTER])
+ master_idx = nla_get_u32(tb[IFLA_MASTER]);
+
+ if (tb[IFLA_LINKINFO])
+ kind_ops = linkinfo_to_kind_ops(tb[IFLA_LINKINFO]);
+
+ if (master_idx || kind_ops)
+ flags |= NLM_F_DUMP_FILTERED;
}
for (h = s_h; h < NETDEV_HASHENTRIES; h++, s_idx = 0) {
idx = 0;
head = &net->dev_index_head[h];
hlist_for_each_entry(dev, head, index_hlist) {
+ if (link_dump_filtered(dev, master_idx, kind_ops))
+ continue;
if (idx < s_idx)
goto cont;
err = rtnl_fill_ifinfo(skb, dev, RTM_NEWLINK,
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, 0,
- NLM_F_MULTI,
+ flags,
ext_filter_mask);
/* If we ran out of room on the first message,
* we're in trouble
@@ -1503,6 +1600,22 @@ static int validate_linkmsg(struct net_device *dev, struct nlattr *tb[])
return 0;
}
+static int handle_infiniband_guid(struct net_device *dev, struct ifla_vf_guid *ivt,
+ int guid_type)
+{
+ const struct net_device_ops *ops = dev->netdev_ops;
+
+ return ops->ndo_set_vf_guid(dev, ivt->vf, ivt->guid, guid_type);
+}
+
+static int handle_vf_guid(struct net_device *dev, struct ifla_vf_guid *ivt, int guid_type)
+{
+ if (dev->type != ARPHRD_INFINIBAND)
+ return -EOPNOTSUPP;
+
+ return handle_infiniband_guid(dev, ivt, guid_type);
+}
+
static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
{
const struct net_device_ops *ops = dev->netdev_ops;
@@ -1605,6 +1718,24 @@ static int do_setvfinfo(struct net_device *dev, struct nlattr **tb)
return err;
}
+ if (tb[IFLA_VF_IB_NODE_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_NODE_GUID]);
+
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_NODE_GUID);
+ }
+
+ if (tb[IFLA_VF_IB_PORT_GUID]) {
+ struct ifla_vf_guid *ivt = nla_data(tb[IFLA_VF_IB_PORT_GUID]);
+
+ if (!ops->ndo_set_vf_guid)
+ return -EOPNOTSUPP;
+
+ return handle_vf_guid(dev, ivt, IFLA_VF_IB_PORT_GUID);
+ }
+
return err;
}
@@ -2533,7 +2664,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
struct net_device *dev,
u8 *addr, u16 vid, u32 pid, u32 seq,
int type, unsigned int flags,
- int nlflags)
+ int nlflags, u16 ndm_state)
{
struct nlmsghdr *nlh;
struct ndmsg *ndm;
@@ -2549,7 +2680,7 @@ static int nlmsg_populate_fdb_fill(struct sk_buff *skb,
ndm->ndm_flags = flags;
ndm->ndm_type = 0;
ndm->ndm_ifindex = dev->ifindex;
- ndm->ndm_state = NUD_PERMANENT;
+ ndm->ndm_state = ndm_state;
if (nla_put(skb, NDA_LLADDR, ETH_ALEN, addr))
goto nla_put_failure;
@@ -2570,7 +2701,8 @@ static inline size_t rtnl_fdb_nlmsg_size(void)
return NLMSG_ALIGN(sizeof(struct ndmsg)) + nla_total_size(ETH_ALEN);
}
-static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type)
+static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type,
+ u16 ndm_state)
{
struct net *net = dev_net(dev);
struct sk_buff *skb;
@@ -2581,7 +2713,7 @@ static void rtnl_fdb_notify(struct net_device *dev, u8 *addr, u16 vid, int type)
goto errout;
err = nlmsg_populate_fdb_fill(skb, dev, addr, vid,
- 0, 0, type, NTF_SELF, 0);
+ 0, 0, type, NTF_SELF, 0, ndm_state);
if (err < 0) {
kfree_skb(skb);
goto errout;
@@ -2716,7 +2848,8 @@ static int rtnl_fdb_add(struct sk_buff *skb, struct nlmsghdr *nlh)
nlh->nlmsg_flags);
if (!err) {
- rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH);
+ rtnl_fdb_notify(dev, addr, vid, RTM_NEWNEIGH,
+ ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
}
}
@@ -2817,7 +2950,8 @@ static int rtnl_fdb_del(struct sk_buff *skb, struct nlmsghdr *nlh)
err = ndo_dflt_fdb_del(ndm, tb, dev, addr, vid);
if (!err) {
- rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH);
+ rtnl_fdb_notify(dev, addr, vid, RTM_DELNEIGH,
+ ndm->ndm_state);
ndm->ndm_flags &= ~NTF_SELF;
}
}
@@ -2845,7 +2979,7 @@ static int nlmsg_populate_fdb(struct sk_buff *skb,
err = nlmsg_populate_fdb_fill(skb, dev, ha->addr, 0,
portid, seq,
RTM_NEWNEIGH, NTF_SELF,
- NLM_F_MULTI);
+ NLM_F_MULTI, NUD_PERMANENT);
if (err < 0)
return err;
skip:
@@ -2877,6 +3011,7 @@ int ndo_dflt_fdb_dump(struct sk_buff *skb,
nlmsg_populate_fdb(skb, cb, dev, &idx, &dev->mc);
out:
netif_addr_unlock_bh(dev);
+ cb->args[1] = err;
return idx;
}
EXPORT_SYMBOL(ndo_dflt_fdb_dump);
@@ -2910,6 +3045,7 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
ops = br_dev->netdev_ops;
}
+ cb->args[1] = 0;
for_each_netdev(net, dev) {
if (brport_idx && (dev->ifindex != brport_idx))
continue;
@@ -2937,12 +3073,16 @@ static int rtnl_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb)
idx = cops->ndo_fdb_dump(skb, cb, br_dev, dev,
idx);
}
+ if (cb->args[1] == -EMSGSIZE)
+ break;
if (dev->netdev_ops->ndo_fdb_dump)
idx = dev->netdev_ops->ndo_fdb_dump(skb, cb, dev, NULL,
idx);
else
idx = ndo_dflt_fdb_dump(skb, cb, dev, NULL, idx);
+ if (cb->args[1] == -EMSGSIZE)
+ break;
cops = NULL;
}
@@ -3317,7 +3457,7 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
{
struct net *net = sock_net(skb->sk);
rtnl_doit_func doit;
- int sz_idx, kind;
+ int kind;
int family;
int type;
int err;
@@ -3333,7 +3473,6 @@ static int rtnetlink_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return 0;
family = ((struct rtgenmsg *)nlmsg_data(nlh))->rtgen_family;
- sz_idx = type>>2;
kind = type&3;
if (kind != 2 && !netlink_net_capable(skb, CAP_NET_ADMIN))
diff --git a/net/core/scm.c b/net/core/scm.c
index 3b6899b7d810..2696aefdc148 100644
--- a/net/core/scm.c
+++ b/net/core/scm.c
@@ -87,6 +87,7 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
*fplp = fpl;
fpl->count = 0;
fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
}
fpp = &fpl->fp[fpl->count];
@@ -107,6 +108,10 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
*fpp++ = file;
fpl->count++;
}
+
+ if (!fpl->user)
+ fpl->user = get_uid(current_user());
+
return num;
}
@@ -119,6 +124,7 @@ void __scm_destroy(struct scm_cookie *scm)
scm->fp = NULL;
for (i=fpl->count-1; i>=0; i--)
fput(fpl->fp[i]);
+ free_uid(fpl->user);
kfree(fpl);
}
}
@@ -289,8 +295,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
/* Bump the usage count and install the file. */
sock = sock_from_file(fp[i], &err);
if (sock) {
- sock_update_netprioidx(sock->sk);
- sock_update_classid(sock->sk);
+ sock_update_netprioidx(&sock->sk->sk_cgrp_data);
+ sock_update_classid(&sock->sk->sk_cgrp_data);
}
fd_install(new_fd, get_file(fp[i]));
}
@@ -305,6 +311,8 @@ void scm_detach_fds(struct msghdr *msg, struct scm_cookie *scm)
err = put_user(cmlen, &cm->cmsg_len);
if (!err) {
cmlen = CMSG_SPACE(i*sizeof(int));
+ if (msg->msg_controllen < cmlen)
+ cmlen = msg->msg_controllen;
msg->msg_control += cmlen;
msg->msg_controllen -= cmlen;
}
@@ -334,6 +342,7 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
+ new_fpl->user = get_uid(fpl->user);
}
return new_fpl;
}
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fab4599ba8b2..d04c2d1c8c87 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -79,6 +79,8 @@
struct kmem_cache *skbuff_head_cache __read_mostly;
static struct kmem_cache *skbuff_fclone_cache __read_mostly;
+int sysctl_max_skb_frags __read_mostly = MAX_SKB_FRAGS;
+EXPORT_SYMBOL(sysctl_max_skb_frags);
/**
* skb_panic - private function for out-of-line support
@@ -347,8 +349,16 @@ struct sk_buff *build_skb(void *data, unsigned int frag_size)
}
EXPORT_SYMBOL(build_skb);
+#define NAPI_SKB_CACHE_SIZE 64
+
+struct napi_alloc_cache {
+ struct page_frag_cache page;
+ size_t skb_count;
+ void *skb_cache[NAPI_SKB_CACHE_SIZE];
+};
+
static DEFINE_PER_CPU(struct page_frag_cache, netdev_alloc_cache);
-static DEFINE_PER_CPU(struct page_frag_cache, napi_alloc_cache);
+static DEFINE_PER_CPU(struct napi_alloc_cache, napi_alloc_cache);
static void *__netdev_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
@@ -378,9 +388,9 @@ EXPORT_SYMBOL(netdev_alloc_frag);
static void *__napi_alloc_frag(unsigned int fragsz, gfp_t gfp_mask)
{
- struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
- return __alloc_page_frag(nc, fragsz, gfp_mask);
+ return __alloc_page_frag(&nc->page, fragsz, gfp_mask);
}
void *napi_alloc_frag(unsigned int fragsz)
@@ -414,7 +424,7 @@ struct sk_buff *__netdev_alloc_skb(struct net_device *dev, unsigned int len,
len += NET_SKB_PAD;
if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
- (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
goto skb_fail;
@@ -474,14 +484,14 @@ EXPORT_SYMBOL(__netdev_alloc_skb);
struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
gfp_t gfp_mask)
{
- struct page_frag_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
struct sk_buff *skb;
void *data;
len += NET_SKB_PAD + NET_IP_ALIGN;
if ((len > SKB_WITH_OVERHEAD(PAGE_SIZE)) ||
- (gfp_mask & (__GFP_WAIT | GFP_DMA))) {
+ (gfp_mask & (__GFP_DIRECT_RECLAIM | GFP_DMA))) {
skb = __alloc_skb(len, gfp_mask, SKB_ALLOC_RX, NUMA_NO_NODE);
if (!skb)
goto skb_fail;
@@ -494,7 +504,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
if (sk_memalloc_socks())
gfp_mask |= __GFP_MEMALLOC;
- data = __alloc_page_frag(nc, len, gfp_mask);
+ data = __alloc_page_frag(&nc->page, len, gfp_mask);
if (unlikely(!data))
return NULL;
@@ -505,7 +515,7 @@ struct sk_buff *__napi_alloc_skb(struct napi_struct *napi, unsigned int len,
}
/* use OR instead of assignment to avoid clearing of bits in mask */
- if (nc->pfmemalloc)
+ if (nc->page.pfmemalloc)
skb->pfmemalloc = 1;
skb->head_frag = 1;
@@ -747,6 +757,73 @@ void consume_skb(struct sk_buff *skb)
}
EXPORT_SYMBOL(consume_skb);
+void __kfree_skb_flush(void)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ /* flush skb_cache if containing objects */
+ if (nc->skb_count) {
+ kmem_cache_free_bulk(skbuff_head_cache, nc->skb_count,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+
+static inline void _kfree_skb_defer(struct sk_buff *skb)
+{
+ struct napi_alloc_cache *nc = this_cpu_ptr(&napi_alloc_cache);
+
+ /* drop skb->head and call any destructors for packet */
+ skb_release_all(skb);
+
+ /* record skb to CPU local list */
+ nc->skb_cache[nc->skb_count++] = skb;
+
+#ifdef CONFIG_SLUB
+ /* SLUB writes into objects when freeing */
+ prefetchw(skb);
+#endif
+
+ /* flush skb_cache if it is filled */
+ if (unlikely(nc->skb_count == NAPI_SKB_CACHE_SIZE)) {
+ kmem_cache_free_bulk(skbuff_head_cache, NAPI_SKB_CACHE_SIZE,
+ nc->skb_cache);
+ nc->skb_count = 0;
+ }
+}
+void __kfree_skb_defer(struct sk_buff *skb)
+{
+ _kfree_skb_defer(skb);
+}
+
+void napi_consume_skb(struct sk_buff *skb, int budget)
+{
+ if (unlikely(!skb))
+ return;
+
+ /* Zero budget indicate non-NAPI context called us, like netpoll */
+ if (unlikely(!budget)) {
+ dev_consume_skb_any(skb);
+ return;
+ }
+
+ if (likely(atomic_read(&skb->users) == 1))
+ smp_rmb();
+ else if (likely(!atomic_dec_and_test(&skb->users)))
+ return;
+ /* if reaching here SKB is ready to free */
+ trace_consume_skb(skb);
+
+ /* if SKB is a clone, don't handle this case */
+ if (skb->fclone != SKB_FCLONE_UNAVAILABLE) {
+ __kfree_skb(skb);
+ return;
+ }
+
+ _kfree_skb_defer(skb);
+}
+EXPORT_SYMBOL(napi_consume_skb);
+
/* Make sure a field is enclosed inside headers_start/headers_end section */
#define CHECK_SKB_FIELD(field) \
BUILD_BUG_ON(offsetof(struct sk_buff, field) < \
@@ -1841,6 +1918,7 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
struct splice_pipe_desc *spd, struct sock *sk)
{
int seg;
+ struct sk_buff *iter;
/* map the linear part :
* If skb->head_frag is set, this 'linear' part is backed by a
@@ -1867,6 +1945,19 @@ static bool __skb_splice_bits(struct sk_buff *skb, struct pipe_inode_info *pipe,
return true;
}
+ skb_walk_frags(skb, iter) {
+ if (*offset >= iter->len) {
+ *offset -= iter->len;
+ continue;
+ }
+ /* __skb_splice_bits() only fails if the output has no room
+ * left, so no point in going over the frag_list for the error
+ * case.
+ */
+ if (__skb_splice_bits(iter, pipe, offset, len, spd, sk))
+ return true;
+ }
+
return false;
}
@@ -1893,9 +1984,7 @@ ssize_t skb_socket_splice(struct sock *sk,
/*
* Map data from the skb to a pipe. Should handle both the linear part,
- * the fragments, and the frag list. It does NOT handle frag lists within
- * the frag list, if such a thing exists. We'd probably need to recurse to
- * handle that cleanly.
+ * the fragments, and the frag list.
*/
int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
struct pipe_inode_info *pipe, unsigned int tlen,
@@ -1914,29 +2003,10 @@ int skb_splice_bits(struct sk_buff *skb, struct sock *sk, unsigned int offset,
.ops = &nosteal_pipe_buf_ops,
.spd_release = sock_spd_release,
};
- struct sk_buff *frag_iter;
int ret = 0;
- /*
- * __skb_splice_bits() only fails if the output has no room left,
- * so no point in going over the frag_list for the error case.
- */
- if (__skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk))
- goto done;
- else if (!tlen)
- goto done;
-
- /*
- * now see if we have a frag_list to map
- */
- skb_walk_frags(skb, frag_iter) {
- if (!tlen)
- break;
- if (__skb_splice_bits(frag_iter, pipe, &offset, &tlen, &spd, sk))
- break;
- }
+ __skb_splice_bits(skb, pipe, &offset, &tlen, &spd, sk);
-done:
if (spd.nr_pages)
ret = splice_cb(sk, pipe, &spd);
@@ -2946,6 +3016,24 @@ int skb_append_pagefrags(struct sk_buff *skb, struct page *page,
EXPORT_SYMBOL_GPL(skb_append_pagefrags);
/**
+ * skb_push_rcsum - push skb and update receive checksum
+ * @skb: buffer to update
+ * @len: length of data pulled
+ *
+ * This function performs an skb_push on the packet and updates
+ * the CHECKSUM_COMPLETE checksum. It should be used on
+ * receive path processing instead of skb_push unless you know
+ * that the checksum difference is zero (e.g., a valid IP header)
+ * or you are setting ip_summed to CHECKSUM_NONE.
+ */
+static unsigned char *skb_push_rcsum(struct sk_buff *skb, unsigned len)
+{
+ skb_push(skb, len);
+ skb_postpush_rcsum(skb, skb->data, len);
+ return skb->data;
+}
+
+/**
* skb_pull_rcsum - pull skb and update receive checksum
* @skb: buffer to update
* @len: length of data pulled
@@ -3004,8 +3092,7 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (unlikely(!proto))
return ERR_PTR(-EINVAL);
- csum = !head_skb->encap_hdr_csum &&
- !!can_checksum_protocol(features, proto);
+ csum = !!can_checksum_protocol(features, proto);
headroom = skb_headroom(head_skb);
pos = skb_headlen(head_skb);
@@ -3098,13 +3185,15 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
if (nskb->len == len + doffset)
goto perform_csum_check;
- if (!sg && !nskb->remcsum_offload) {
- nskb->ip_summed = CHECKSUM_NONE;
- nskb->csum = skb_copy_and_csum_bits(head_skb, offset,
- skb_put(nskb, len),
- len, 0);
+ if (!sg) {
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_copy_and_csum_bits(head_skb, offset,
+ skb_put(nskb, len),
+ len, 0);
SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ skb_headroom(nskb) + doffset;
continue;
}
@@ -3170,12 +3259,19 @@ skip_fraglist:
nskb->truesize += nskb->data_len;
perform_csum_check:
- if (!csum && !nskb->remcsum_offload) {
- nskb->csum = skb_checksum(nskb, doffset,
- nskb->len - doffset, 0);
- nskb->ip_summed = CHECKSUM_NONE;
+ if (!csum) {
+ if (skb_has_shared_frag(nskb)) {
+ err = __skb_linearize(nskb);
+ if (err)
+ goto err;
+ }
+ if (!nskb->remcsum_offload)
+ nskb->ip_summed = CHECKSUM_NONE;
+ SKB_GSO_CB(nskb)->csum =
+ skb_checksum(nskb, doffset,
+ nskb->len - doffset, 0);
SKB_GSO_CB(nskb)->csum_start =
- skb_headroom(nskb) + doffset;
+ skb_headroom(nskb) + doffset;
}
} while ((offset += len) < head_skb->len);
@@ -3643,7 +3739,8 @@ static void __skb_complete_tx_timestamp(struct sk_buff *skb,
serr->ee.ee_info = tstype;
if (sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID) {
serr->ee.ee_data = skb_shinfo(skb)->tskey;
- if (sk->sk_protocol == IPPROTO_TCP)
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM)
serr->ee.ee_data -= sk->sk_tskey;
}
@@ -4081,9 +4178,9 @@ struct sk_buff *skb_checksum_trimmed(struct sk_buff *skb,
if (!pskb_may_pull(skb_chk, offset))
goto err;
- __skb_pull(skb_chk, offset);
+ skb_pull_rcsum(skb_chk, offset);
ret = skb_chkf(skb_chk);
- __skb_push(skb_chk, offset);
+ skb_push_rcsum(skb_chk, offset);
if (ret)
goto err;
@@ -4216,7 +4313,6 @@ void skb_scrub_packet(struct sk_buff *skb, bool xnet)
skb->skb_iif = 0;
skb->ignore_df = 0;
skb_dst_drop(skb);
- skb_sender_cpu_clear(skb);
secpath_reset(skb);
nf_reset(skb);
nf_reset_trace(skb);
@@ -4268,7 +4364,8 @@ static struct sk_buff *skb_reorder_vlan_header(struct sk_buff *skb)
return NULL;
}
- memmove(skb->data - ETH_HLEN, skb->data - VLAN_ETH_HLEN, 2 * ETH_ALEN);
+ memmove(skb->data - ETH_HLEN, skb->data - skb->mac_len - VLAN_HLEN,
+ 2 * ETH_ALEN);
skb->mac_header += VLAN_HLEN;
return skb;
}
@@ -4411,9 +4508,7 @@ int skb_vlan_push(struct sk_buff *skb, __be16 vlan_proto, u16 vlan_tci)
skb->mac_len += VLAN_HLEN;
__skb_pull(skb, offset);
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(skb->csum, csum_partial(skb->data
- + (2 * ETH_ALEN), VLAN_HLEN, 0));
+ skb_postpush_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
}
__vlan_hwaccel_put_tag(skb, vlan_proto, vlan_tci);
return 0;
@@ -4452,7 +4547,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
return NULL;
gfp_head = gfp_mask;
- if (gfp_head & __GFP_WAIT)
+ if (gfp_head & __GFP_DIRECT_RECLAIM)
gfp_head |= __GFP_REPEAT;
*errcode = -ENOBUFS;
@@ -4467,7 +4562,7 @@ struct sk_buff *alloc_skb_with_frags(unsigned long header_len,
while (order) {
if (npages >= 1 << order) {
- page = alloc_pages((gfp_mask & ~__GFP_WAIT) |
+ page = alloc_pages((gfp_mask & ~__GFP_DIRECT_RECLAIM) |
__GFP_COMP |
__GFP_NOWARN |
__GFP_NORETRY,
diff --git a/net/core/sock.c b/net/core/sock.c
index 7529eb9463be..7e73c26b6bb4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -134,6 +134,7 @@
#include <linux/sock_diag.h>
#include <linux/filter.h>
+#include <net/sock_reuseport.h>
#include <trace/events/sock.h>
@@ -194,44 +195,6 @@ bool sk_net_capable(const struct sock *sk, int cap)
}
EXPORT_SYMBOL(sk_net_capable);
-
-#ifdef CONFIG_MEMCG_KMEM
-int mem_cgroup_sockets_init(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- struct proto *proto;
- int ret = 0;
-
- mutex_lock(&proto_list_mutex);
- list_for_each_entry(proto, &proto_list, node) {
- if (proto->init_cgroup) {
- ret = proto->init_cgroup(memcg, ss);
- if (ret)
- goto out;
- }
- }
-
- mutex_unlock(&proto_list_mutex);
- return ret;
-out:
- list_for_each_entry_continue_reverse(proto, &proto_list, node)
- if (proto->destroy_cgroup)
- proto->destroy_cgroup(memcg);
- mutex_unlock(&proto_list_mutex);
- return ret;
-}
-
-void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
-{
- struct proto *proto;
-
- mutex_lock(&proto_list_mutex);
- list_for_each_entry_reverse(proto, &proto_list, node)
- if (proto->destroy_cgroup)
- proto->destroy_cgroup(memcg);
- mutex_unlock(&proto_list_mutex);
-}
-#endif
-
/*
* Each address family might have different locking rules, so we have
* one slock key per address family:
@@ -239,11 +202,6 @@ void mem_cgroup_sockets_destroy(struct mem_cgroup *memcg)
static struct lock_class_key af_family_keys[AF_MAX];
static struct lock_class_key af_family_slock_keys[AF_MAX];
-#if defined(CONFIG_MEMCG_KMEM)
-struct static_key memcg_socket_limit_enabled;
-EXPORT_SYMBOL(memcg_socket_limit_enabled);
-#endif
-
/*
* Make lock validator output more readable. (we pre-construct these
* strings build-time, so that runtime initialization of socket
@@ -263,7 +221,8 @@ static const char *const af_family_key_strings[AF_MAX+1] = {
"sk_lock-AF_TIPC" , "sk_lock-AF_BLUETOOTH", "sk_lock-IUCV" ,
"sk_lock-AF_RXRPC" , "sk_lock-AF_ISDN" , "sk_lock-AF_PHONET" ,
"sk_lock-AF_IEEE802154", "sk_lock-AF_CAIF" , "sk_lock-AF_ALG" ,
- "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_MAX"
+ "sk_lock-AF_NFC" , "sk_lock-AF_VSOCK" , "sk_lock-AF_KCM" ,
+ "sk_lock-AF_MAX"
};
static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_UNSPEC", "slock-AF_UNIX" , "slock-AF_INET" ,
@@ -279,7 +238,8 @@ static const char *const af_family_slock_key_strings[AF_MAX+1] = {
"slock-AF_TIPC" , "slock-AF_BLUETOOTH", "slock-AF_IUCV" ,
"slock-AF_RXRPC" , "slock-AF_ISDN" , "slock-AF_PHONET" ,
"slock-AF_IEEE802154", "slock-AF_CAIF" , "slock-AF_ALG" ,
- "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_MAX"
+ "slock-AF_NFC" , "slock-AF_VSOCK" ,"slock-AF_KCM" ,
+ "slock-AF_MAX"
};
static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_UNSPEC", "clock-AF_UNIX" , "clock-AF_INET" ,
@@ -295,7 +255,8 @@ static const char *const af_family_clock_key_strings[AF_MAX+1] = {
"clock-AF_TIPC" , "clock-AF_BLUETOOTH", "clock-AF_IUCV" ,
"clock-AF_RXRPC" , "clock-AF_ISDN" , "clock-AF_PHONET" ,
"clock-AF_IEEE802154", "clock-AF_CAIF" , "clock-AF_ALG" ,
- "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_MAX"
+ "clock-AF_NFC" , "clock-AF_VSOCK" , "clock-AF_KCM" ,
+ "clock-AF_MAX"
};
/*
@@ -433,8 +394,6 @@ static bool sock_needs_netstamp(const struct sock *sk)
}
}
-#define SK_FLAGS_TIMESTAMP ((1UL << SOCK_TIMESTAMP) | (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE))
-
static void sock_disable_timestamp(struct sock *sk, unsigned long flags)
{
if (sk->sk_flags & flags) {
@@ -874,7 +833,8 @@ set_rcvbuf:
if (val & SOF_TIMESTAMPING_OPT_ID &&
!(sk->sk_tsflags & SOF_TIMESTAMPING_OPT_ID)) {
- if (sk->sk_protocol == IPPROTO_TCP) {
+ if (sk->sk_protocol == IPPROTO_TCP &&
+ sk->sk_type == SOCK_STREAM) {
if (sk->sk_state != TCP_ESTABLISHED) {
ret = -EINVAL;
break;
@@ -933,6 +893,32 @@ set_rcvbuf:
}
break;
+ case SO_ATTACH_REUSEPORT_CBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(struct sock_fprog)) {
+ struct sock_fprog fprog;
+
+ ret = -EFAULT;
+ if (copy_from_user(&fprog, optval, sizeof(fprog)))
+ break;
+
+ ret = sk_reuseport_attach_filter(&fprog, sk);
+ }
+ break;
+
+ case SO_ATTACH_REUSEPORT_EBPF:
+ ret = -EINVAL;
+ if (optlen == sizeof(u32)) {
+ u32 ufd;
+
+ ret = -EFAULT;
+ if (copy_from_user(&ufd, optval, sizeof(ufd)))
+ break;
+
+ ret = sk_reuseport_attach_bpf(ufd, sk);
+ }
+ break;
+
case SO_DETACH_FILTER:
ret = sk_detach_filter(sk);
break;
@@ -1004,6 +990,10 @@ set_rcvbuf:
sk->sk_incoming_cpu = val;
break;
+ case SO_CNX_ADVICE:
+ if (val == 1)
+ dst_negative_advice(sk);
+ break;
default:
ret = -ENOPROTOOPT;
break;
@@ -1363,6 +1353,7 @@ static struct sock *sk_prot_alloc(struct proto *prot, gfp_t priority,
if (!try_module_get(prot->owner))
goto out_free_sec;
sk_tx_queue_clear(sk);
+ cgroup_sk_alloc(&sk->sk_cgrp_data);
}
return sk;
@@ -1385,6 +1376,7 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
owner = prot->owner;
slab = prot->slab;
+ cgroup_sk_free(&sk->sk_cgrp_data);
security_sk_free(sk);
if (slab != NULL)
kmem_cache_free(slab, sk);
@@ -1393,17 +1385,6 @@ static void sk_prot_free(struct proto *prot, struct sock *sk)
module_put(owner);
}
-#if IS_ENABLED(CONFIG_CGROUP_NET_PRIO)
-void sock_update_netprioidx(struct sock *sk)
-{
- if (in_interrupt())
- return;
-
- sk->sk_cgrp_prioidx = task_netprioidx(current);
-}
-EXPORT_SYMBOL_GPL(sock_update_netprioidx);
-#endif
-
/**
* sk_alloc - All socket objects are allocated here
* @net: the applicable net namespace
@@ -1432,8 +1413,8 @@ struct sock *sk_alloc(struct net *net, int family, gfp_t priority,
sock_net_set(sk, net);
atomic_set(&sk->sk_wmem_alloc, 1);
- sock_update_classid(sk);
- sock_update_netprioidx(sk);
+ sock_update_classid(&sk->sk_cgrp_data);
+ sock_update_netprioidx(&sk->sk_cgrp_data);
}
return sk;
@@ -1453,6 +1434,8 @@ void sk_destruct(struct sock *sk)
sk_filter_uncharge(sk, filter);
RCU_INIT_POINTER(sk->sk_filter, NULL);
}
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
sock_disable_timestamp(sk, SK_FLAGS_TIMESTAMP);
@@ -1488,12 +1471,6 @@ void sk_free(struct sock *sk)
}
EXPORT_SYMBOL(sk_free);
-static void sk_update_clone(const struct sock *sk, struct sock *newsk)
-{
- if (mem_cgroup_sockets_enabled && sk->sk_cgrp)
- sock_update_memcg(newsk);
-}
-
/**
* sk_clone_lock - clone a socket, and lock its clone
* @sk: the socket to clone
@@ -1530,7 +1507,6 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
skb_queue_head_init(&newsk->sk_receive_queue);
skb_queue_head_init(&newsk->sk_write_queue);
- spin_lock_init(&newsk->sk_dst_lock);
rwlock_init(&newsk->sk_callback_lock);
lockdep_set_class_and_name(&newsk->sk_callback_lock,
af_callback_keys + newsk->sk_family,
@@ -1553,7 +1529,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
*/
is_charged = sk_filter_charge(newsk, filter);
- if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk))) {
+ if (unlikely(!is_charged || xfrm_sk_clone_policy(newsk, sk))) {
/* It is still raw copy of parent, so invalidate
* destructor and make plain sk_free() */
newsk->sk_destruct = NULL;
@@ -1562,6 +1538,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
newsk = NULL;
goto out;
}
+ RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
newsk->sk_err = 0;
newsk->sk_priority = 0;
@@ -1589,7 +1566,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority)
sk_set_socket(newsk, NULL);
newsk->sk_wq = NULL;
- sk_update_clone(sk, newsk);
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ sock_update_memcg(newsk);
if (newsk->sk_prot->sockets_allocated)
sk_sockets_allocated_inc(newsk);
@@ -1607,7 +1585,7 @@ void sk_setup_caps(struct sock *sk, struct dst_entry *dst)
{
u32 max_segs = 1;
- __sk_dst_set(sk, dst);
+ sk_dst_set(sk, dst);
sk->sk_route_caps = dst->dev->features;
if (sk->sk_route_caps & NETIF_F_GSO)
sk->sk_route_caps |= NETIF_F_GSO_SOFTWARE;
@@ -1815,7 +1793,7 @@ static long sock_wait_for_wmem(struct sock *sk, long timeo)
{
DEFINE_WAIT(wait);
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
for (;;) {
if (!timeo)
break;
@@ -1861,7 +1839,7 @@ struct sk_buff *sock_alloc_send_pskb(struct sock *sk, unsigned long header_len,
if (sk_wmem_alloc_get(sk) < sk->sk_sndbuf)
break;
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
err = -EAGAIN;
if (!timeo)
@@ -1933,7 +1911,7 @@ EXPORT_SYMBOL(sock_cmsg_send);
bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
{
if (pfrag->page) {
- if (atomic_read(&pfrag->page->_count) == 1) {
+ if (page_ref_count(pfrag->page) == 1) {
pfrag->offset = 0;
return true;
}
@@ -1944,8 +1922,10 @@ bool skb_page_frag_refill(unsigned int sz, struct page_frag *pfrag, gfp_t gfp)
pfrag->offset = 0;
if (SKB_FRAG_PAGE_ORDER) {
- pfrag->page = alloc_pages((gfp & ~__GFP_WAIT) | __GFP_COMP |
- __GFP_NOWARN | __GFP_NORETRY,
+ /* Avoid direct reclaim but allow kswapd to wake */
+ pfrag->page = alloc_pages((gfp & ~__GFP_DIRECT_RECLAIM) |
+ __GFP_COMP | __GFP_NOWARN |
+ __GFP_NORETRY,
SKB_FRAG_PAGE_ORDER);
if (likely(pfrag->page)) {
pfrag->size = PAGE_SIZE << SKB_FRAG_PAGE_ORDER;
@@ -2046,9 +2026,9 @@ int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb)
DEFINE_WAIT(wait);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
rc = sk_wait_event(sk, timeo, skb_peek_tail(&sk->sk_receive_queue) != skb);
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
finish_wait(sk_sleep(sk), &wait);
return rc;
}
@@ -2069,27 +2049,27 @@ int __sk_mem_schedule(struct sock *sk, int size, int kind)
struct proto *prot = sk->sk_prot;
int amt = sk_mem_pages(size);
long allocated;
- int parent_status = UNDER_LIMIT;
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- allocated = sk_memory_allocated_add(sk, amt, &parent_status);
+ allocated = sk_memory_allocated_add(sk, amt);
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg &&
+ !mem_cgroup_charge_skmem(sk->sk_memcg, amt))
+ goto suppress_allocation;
/* Under limit. */
- if (parent_status == UNDER_LIMIT &&
- allocated <= sk_prot_mem_limits(sk, 0)) {
+ if (allocated <= sk_prot_mem_limits(sk, 0)) {
sk_leave_memory_pressure(sk);
return 1;
}
- /* Under pressure. (we or our parents) */
- if ((parent_status > SOFT_LIMIT) ||
- allocated > sk_prot_mem_limits(sk, 1))
+ /* Under pressure. */
+ if (allocated > sk_prot_mem_limits(sk, 1))
sk_enter_memory_pressure(sk);
- /* Over hard limit (we or our parents) */
- if ((parent_status == OVER_LIMIT) ||
- (allocated > sk_prot_mem_limits(sk, 2)))
+ /* Over hard limit. */
+ if (allocated > sk_prot_mem_limits(sk, 2))
goto suppress_allocation;
/* guarantee minimum buffer size under pressure */
@@ -2138,6 +2118,9 @@ suppress_allocation:
sk_memory_allocated_sub(sk, amt);
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, amt);
+
return 0;
}
EXPORT_SYMBOL(__sk_mem_schedule);
@@ -2153,6 +2136,9 @@ void __sk_mem_reclaim(struct sock *sk, int amount)
sk_memory_allocated_sub(sk, amount);
sk->sk_forward_alloc -= amount << SK_MEM_QUANTUM_SHIFT;
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ mem_cgroup_uncharge_skmem(sk->sk_memcg, amount);
+
if (sk_under_memory_pressure(sk) &&
(sk_memory_allocated(sk) < sk_prot_mem_limits(sk, 0)))
sk_leave_memory_pressure(sk);
@@ -2281,7 +2267,7 @@ static void sock_def_wakeup(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_all(&wq->wait);
rcu_read_unlock();
}
@@ -2292,7 +2278,7 @@ static void sock_def_error_report(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, POLLERR);
sk_wake_async(sk, SOCK_WAKE_IO, POLL_ERR);
rcu_read_unlock();
@@ -2304,7 +2290,7 @@ static void sock_def_readable(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLIN | POLLPRI |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
@@ -2322,7 +2308,7 @@ static void sock_def_write_space(struct sock *sk)
*/
if ((atomic_read(&sk->sk_wmem_alloc) << 1) <= sk->sk_sndbuf) {
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
POLLWRNORM | POLLWRBAND);
@@ -2386,7 +2372,6 @@ void sock_init_data(struct socket *sock, struct sock *sk)
} else
sk->sk_wq = NULL;
- spin_lock_init(&sk->sk_dst_lock);
rwlock_init(&sk->sk_callback_lock);
lockdep_set_class_and_name(&sk->sk_callback_lock,
af_callback_keys + sk->sk_family,
diff --git a/net/core/sock_diag.c b/net/core/sock_diag.c
index 0c1d58d43f67..a996ce8c8fb2 100644
--- a/net/core/sock_diag.c
+++ b/net/core/sock_diag.c
@@ -214,7 +214,7 @@ void sock_diag_unregister(const struct sock_diag_handler *hnld)
}
EXPORT_SYMBOL_GPL(sock_diag_unregister);
-static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
+static int __sock_diag_cmd(struct sk_buff *skb, struct nlmsghdr *nlh)
{
int err;
struct sock_diag_req *req = nlmsg_data(nlh);
@@ -234,8 +234,12 @@ static int __sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
hndl = sock_diag_handlers[req->sdiag_family];
if (hndl == NULL)
err = -ENOENT;
- else
+ else if (nlh->nlmsg_type == SOCK_DIAG_BY_FAMILY)
err = hndl->dump(skb, nlh);
+ else if (nlh->nlmsg_type == SOCK_DESTROY && hndl->destroy)
+ err = hndl->destroy(skb, nlh);
+ else
+ err = -EOPNOTSUPP;
mutex_unlock(&sock_diag_table_mutex);
return err;
@@ -261,7 +265,8 @@ static int sock_diag_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
return ret;
case SOCK_DIAG_BY_FAMILY:
- return __sock_diag_rcv_msg(skb, nlh);
+ case SOCK_DESTROY:
+ return __sock_diag_cmd(skb, nlh);
default:
return -EINVAL;
}
@@ -295,6 +300,18 @@ static int sock_diag_bind(struct net *net, int group)
return 0;
}
+int sock_diag_destroy(struct sock *sk, int err)
+{
+ if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
+ if (!sk->sk_prot->diag_destroy)
+ return -EOPNOTSUPP;
+
+ return sk->sk_prot->diag_destroy(sk, err);
+}
+EXPORT_SYMBOL_GPL(sock_diag_destroy);
+
static int __net_init diag_net_init(struct net *net)
{
struct netlink_kernel_cfg cfg = {
diff --git a/net/core/sock_reuseport.c b/net/core/sock_reuseport.c
new file mode 100644
index 000000000000..e92b759d906c
--- /dev/null
+++ b/net/core/sock_reuseport.c
@@ -0,0 +1,258 @@
+/*
+ * To speed up listener socket lookup, create an array to store all sockets
+ * listening on the same port. This allows a decision to be made after finding
+ * the first socket. An optional BPF program can also be configured for
+ * selecting the socket index from the array of available sockets.
+ */
+
+#include <net/sock_reuseport.h>
+#include <linux/bpf.h>
+#include <linux/rcupdate.h>
+
+#define INIT_SOCKS 128
+
+static DEFINE_SPINLOCK(reuseport_lock);
+
+static struct sock_reuseport *__reuseport_alloc(u16 max_socks)
+{
+ size_t size = sizeof(struct sock_reuseport) +
+ sizeof(struct sock *) * max_socks;
+ struct sock_reuseport *reuse = kzalloc(size, GFP_ATOMIC);
+
+ if (!reuse)
+ return NULL;
+
+ reuse->max_socks = max_socks;
+
+ RCU_INIT_POINTER(reuse->prog, NULL);
+ return reuse;
+}
+
+int reuseport_alloc(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+
+ /* bh lock used since this function call may precede hlist lock in
+ * soft irq of receive path or setsockopt from process context
+ */
+ spin_lock_bh(&reuseport_lock);
+ WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock)),
+ "multiple allocations for the same socket");
+ reuse = __reuseport_alloc(INIT_SOCKS);
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOMEM;
+ }
+
+ reuse->socks[0] = sk;
+ reuse->num_socks = 1;
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_alloc);
+
+static struct sock_reuseport *reuseport_grow(struct sock_reuseport *reuse)
+{
+ struct sock_reuseport *more_reuse;
+ u32 more_socks_size, i;
+
+ more_socks_size = reuse->max_socks * 2U;
+ if (more_socks_size > U16_MAX)
+ return NULL;
+
+ more_reuse = __reuseport_alloc(more_socks_size);
+ if (!more_reuse)
+ return NULL;
+
+ more_reuse->max_socks = more_socks_size;
+ more_reuse->num_socks = reuse->num_socks;
+ more_reuse->prog = reuse->prog;
+
+ memcpy(more_reuse->socks, reuse->socks,
+ reuse->num_socks * sizeof(struct sock *));
+
+ for (i = 0; i < reuse->num_socks; ++i)
+ rcu_assign_pointer(reuse->socks[i]->sk_reuseport_cb,
+ more_reuse);
+
+ /* Note: we use kfree_rcu here instead of reuseport_free_rcu so
+ * that reuse and more_reuse can temporarily share a reference
+ * to prog.
+ */
+ kfree_rcu(reuse, rcu);
+ return more_reuse;
+}
+
+/**
+ * reuseport_add_sock - Add a socket to the reuseport group of another.
+ * @sk: New socket to add to the group.
+ * @sk2: Socket belonging to the existing reuseport group.
+ * May return ENOMEM and not add socket to group under memory pressure.
+ */
+int reuseport_add_sock(struct sock *sk, struct sock *sk2)
+{
+ struct sock_reuseport *reuse;
+
+ if (!rcu_access_pointer(sk2->sk_reuseport_cb)) {
+ int err = reuseport_alloc(sk2);
+
+ if (err)
+ return err;
+ }
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk2->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock)),
+ WARN_ONCE(rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock)),
+ "socket already in reuseport group");
+
+ if (reuse->num_socks == reuse->max_socks) {
+ reuse = reuseport_grow(reuse);
+ if (!reuse) {
+ spin_unlock_bh(&reuseport_lock);
+ return -ENOMEM;
+ }
+ }
+
+ reuse->socks[reuse->num_socks] = sk;
+ /* paired with smp_rmb() in reuseport_select_sock() */
+ smp_wmb();
+ reuse->num_socks++;
+ rcu_assign_pointer(sk->sk_reuseport_cb, reuse);
+
+ spin_unlock_bh(&reuseport_lock);
+
+ return 0;
+}
+EXPORT_SYMBOL(reuseport_add_sock);
+
+static void reuseport_free_rcu(struct rcu_head *head)
+{
+ struct sock_reuseport *reuse;
+
+ reuse = container_of(head, struct sock_reuseport, rcu);
+ if (reuse->prog)
+ bpf_prog_destroy(reuse->prog);
+ kfree(reuse);
+}
+
+void reuseport_detach_sock(struct sock *sk)
+{
+ struct sock_reuseport *reuse;
+ int i;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ rcu_assign_pointer(sk->sk_reuseport_cb, NULL);
+
+ for (i = 0; i < reuse->num_socks; i++) {
+ if (reuse->socks[i] == sk) {
+ reuse->socks[i] = reuse->socks[reuse->num_socks - 1];
+ reuse->num_socks--;
+ if (reuse->num_socks == 0)
+ call_rcu(&reuse->rcu, reuseport_free_rcu);
+ break;
+ }
+ }
+ spin_unlock_bh(&reuseport_lock);
+}
+EXPORT_SYMBOL(reuseport_detach_sock);
+
+static struct sock *run_bpf(struct sock_reuseport *reuse, u16 socks,
+ struct bpf_prog *prog, struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sk_buff *nskb = NULL;
+ u32 index;
+
+ if (skb_shared(skb)) {
+ nskb = skb_clone(skb, GFP_ATOMIC);
+ if (!nskb)
+ return NULL;
+ skb = nskb;
+ }
+
+ /* temporarily advance data past protocol header */
+ if (!pskb_pull(skb, hdr_len)) {
+ kfree_skb(nskb);
+ return NULL;
+ }
+ index = bpf_prog_run_save_cb(prog, skb);
+ __skb_push(skb, hdr_len);
+
+ consume_skb(nskb);
+
+ if (index >= socks)
+ return NULL;
+
+ return reuse->socks[index];
+}
+
+/**
+ * reuseport_select_sock - Select a socket from an SO_REUSEPORT group.
+ * @sk: First socket in the group.
+ * @hash: When no BPF filter is available, use this hash to select.
+ * @skb: skb to run through BPF filter.
+ * @hdr_len: BPF filter expects skb data pointer at payload data. If
+ * the skb does not yet point at the payload, this parameter represents
+ * how far the pointer needs to advance to reach the payload.
+ * Returns a socket that should receive the packet (or NULL on error).
+ */
+struct sock *reuseport_select_sock(struct sock *sk,
+ u32 hash,
+ struct sk_buff *skb,
+ int hdr_len)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *prog;
+ struct sock *sk2 = NULL;
+ u16 socks;
+
+ rcu_read_lock();
+ reuse = rcu_dereference(sk->sk_reuseport_cb);
+
+ /* if memory allocation failed or add call is not yet complete */
+ if (!reuse)
+ goto out;
+
+ prog = rcu_dereference(reuse->prog);
+ socks = READ_ONCE(reuse->num_socks);
+ if (likely(socks)) {
+ /* paired with smp_wmb() in reuseport_add_sock() */
+ smp_rmb();
+
+ if (prog && skb)
+ sk2 = run_bpf(reuse, socks, prog, skb, hdr_len);
+ else
+ sk2 = reuse->socks[reciprocal_scale(hash, socks)];
+ }
+
+out:
+ rcu_read_unlock();
+ return sk2;
+}
+EXPORT_SYMBOL(reuseport_select_sock);
+
+struct bpf_prog *
+reuseport_attach_prog(struct sock *sk, struct bpf_prog *prog)
+{
+ struct sock_reuseport *reuse;
+ struct bpf_prog *old_prog;
+
+ spin_lock_bh(&reuseport_lock);
+ reuse = rcu_dereference_protected(sk->sk_reuseport_cb,
+ lockdep_is_held(&reuseport_lock));
+ old_prog = rcu_dereference_protected(reuse->prog,
+ lockdep_is_held(&reuseport_lock));
+ rcu_assign_pointer(reuse->prog, prog);
+ spin_unlock_bh(&reuseport_lock);
+
+ return old_prog;
+}
+EXPORT_SYMBOL(reuseport_attach_prog);
diff --git a/net/core/stream.c b/net/core/stream.c
index d70f77a0c889..159516a11b7e 100644
--- a/net/core/stream.c
+++ b/net/core/stream.c
@@ -35,11 +35,11 @@ void sk_stream_write_space(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_poll(&wq->wait, POLLOUT |
POLLWRNORM | POLLWRBAND);
if (wq && wq->fasync_list && !(sk->sk_shutdown & SEND_SHUTDOWN))
- sock_wake_async(sock, SOCK_WAKE_SPACE, POLL_OUT);
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock();
}
}
@@ -126,7 +126,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
current_timeo = vm_wait = (prandom_u32() % (HZ / 5)) + 2;
while (1) {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
@@ -139,7 +139,7 @@ int sk_stream_wait_memory(struct sock *sk, long *timeo_p)
}
if (signal_pending(current))
goto do_interrupted;
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk_stream_memory_free(sk) && !vm_wait)
break;
diff --git a/net/core/sysctl_net_core.c b/net/core/sysctl_net_core.c
index 95b6139d710c..a6beb7b6ae55 100644
--- a/net/core/sysctl_net_core.c
+++ b/net/core/sysctl_net_core.c
@@ -26,6 +26,7 @@ static int zero = 0;
static int one = 1;
static int min_sndbuf = SOCK_MIN_SNDBUF;
static int min_rcvbuf = SOCK_MIN_RCVBUF;
+static int max_skb_frags = MAX_SKB_FRAGS;
static int net_msg_warn; /* Unused, but still a sysctl */
@@ -392,6 +393,15 @@ static struct ctl_table net_core_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec
},
+ {
+ .procname = "max_skb_frags",
+ .data = &sysctl_max_skb_frags,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one,
+ .extra2 = &max_skb_frags,
+ },
{ }
};
diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
index 5684e14932bd..9c67a961ba53 100644
--- a/net/dccp/ipv4.c
+++ b/net/dccp/ipv4.c
@@ -204,8 +204,6 @@ void dccp_req_err(struct sock *sk, u64 seq)
* ICMPs are not backlogged, hence we cannot get an established
* socket here.
*/
- WARN_ON(req->sk);
-
if (!between48(seq, dccp_rsk(req)->dreq_iss, dccp_rsk(req)->dreq_gss)) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
} else {
@@ -802,7 +800,7 @@ static int dccp_v4_rcv(struct sk_buff *skb)
}
lookup:
- sk = __inet_lookup_skb(&dccp_hashinfo, skb,
+ sk = __inet_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
dh->dccph_sport, dh->dccph_dport);
if (!sk) {
dccp_pr_debug("failed to look up flow ID in table and "
@@ -824,26 +822,26 @@ lookup:
if (sk->sk_state == DCCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
- struct sock *nsk = NULL;
+ struct sock *nsk;
sk = req->rsk_listener;
- if (likely(sk->sk_state == DCCP_LISTEN)) {
- nsk = dccp_check_req(sk, skb, req);
- } else {
+ if (unlikely(sk->sk_state != DCCP_LISTEN)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
+ sock_hold(sk);
+ nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
- goto discard_it;
+ goto discard_and_relse;
}
if (nsk == sk) {
- sock_hold(sk);
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
dccp_v4_ctl_send_reset(sk, skb);
- goto discard_it;
+ goto discard_and_relse;
} else {
+ sock_put(sk);
return 0;
}
}
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index db5fc2440a23..4663a01d5039 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -202,7 +202,9 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
security_req_classify_flow(req, flowi6_to_flowi(&fl6));
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
@@ -219,7 +221,10 @@ static int dccp_v6_send_response(const struct sock *sk, struct request_sock *req
&ireq->ir_v6_loc_addr,
&ireq->ir_v6_rmt_addr);
fl6.daddr = ireq->ir_v6_rmt_addr;
- err = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
+ rcu_read_lock();
+ err = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
+ np->tclass);
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -387,6 +392,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
struct inet_request_sock *ireq = inet_rsk(req);
struct ipv6_pinfo *newnp;
const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
struct inet_sock *newinet;
struct dccp6_sock *newdp6;
struct sock *newsk;
@@ -453,7 +459,7 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
* comment in that function for the gory details. -acme
*/
- __ip6_dst_store(newsk, dst, NULL, NULL);
+ ip6_dst_store(newsk, dst, NULL, NULL);
newsk->sk_route_caps = dst->dev->features & ~(NETIF_F_IP_CSUM |
NETIF_F_TSO);
newdp6 = (struct dccp6_sock *)newsk;
@@ -488,13 +494,15 @@ static struct sock *dccp_v6_request_recv_sock(const struct sock *sk,
* Yes, keeping reference count would be much more clever, but we make
* one more one thing there: reattach optmem to newsk.
*/
- if (np->opt != NULL)
- newnp->opt = ipv6_dup_options(newsk, np->opt);
-
+ opt = rcu_dereference(np->opt);
+ if (opt) {
+ opt = ipv6_dup_options(newsk, opt);
+ RCU_INIT_POINTER(newnp->opt, opt);
+ }
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newnp->opt != NULL)
- inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
- newnp->opt->opt_flen);
+ if (opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
+ opt->opt_flen;
dccp_sync_mss(newsk, dst_mtu(dst));
@@ -660,7 +668,7 @@ static int dccp_v6_rcv(struct sk_buff *skb)
DCCP_SKB_CB(skb)->dccpd_ack_seq = dccp_hdr_ack_seq(skb);
lookup:
- sk = __inet6_lookup_skb(&dccp_hashinfo, skb,
+ sk = __inet6_lookup_skb(&dccp_hashinfo, skb, __dccp_hdr_len(dh),
dh->dccph_sport, dh->dccph_dport,
inet6_iif(skb));
if (!sk) {
@@ -683,26 +691,26 @@ lookup:
if (sk->sk_state == DCCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
- struct sock *nsk = NULL;
+ struct sock *nsk;
sk = req->rsk_listener;
- if (likely(sk->sk_state == DCCP_LISTEN)) {
- nsk = dccp_check_req(sk, skb, req);
- } else {
+ if (unlikely(sk->sk_state != DCCP_LISTEN)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
+ sock_hold(sk);
+ nsk = dccp_check_req(sk, skb, req);
if (!nsk) {
reqsk_put(req);
- goto discard_it;
+ goto discard_and_relse;
}
if (nsk == sk) {
- sock_hold(sk);
reqsk_put(req);
} else if (dccp_child_process(sk, nsk, skb)) {
dccp_v6_ctl_send_reset(sk, skb);
- goto discard_it;
+ goto discard_and_relse;
} else {
+ sock_put(sk);
return 0;
}
}
@@ -757,6 +765,7 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct ipv6_pinfo *np = inet6_sk(sk);
struct dccp_sock *dp = dccp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
+ struct ipv6_txoptions *opt;
struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
@@ -856,7 +865,8 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.fl6_sport = inet->inet_sport;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ final_p = fl6_update_dst(&fl6, opt, &final);
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
@@ -873,12 +883,11 @@ static int dccp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
np->saddr = *saddr;
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
- __ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, NULL, NULL);
icsk->icsk_ext_hdr_len = 0;
- if (np->opt != NULL)
- icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
- np->opt->opt_nflen);
+ if (opt)
+ icsk->icsk_ext_hdr_len = opt->opt_flen + opt->opt_nflen;
inet->inet_dport = usin->sin6_port;
@@ -984,7 +993,7 @@ static struct proto dccp_v6_prot = {
.sendmsg = dccp_sendmsg,
.recvmsg = dccp_recvmsg,
.backlog_rcv = dccp_v6_do_rcv,
- .hash = inet_hash,
+ .hash = inet6_hash,
.unhash = inet_unhash,
.accept = inet_csk_accept,
.get_port = inet_csk_get_port,
diff --git a/net/dccp/output.c b/net/dccp/output.c
index 4ce912e691d0..b66c84db0766 100644
--- a/net/dccp/output.c
+++ b/net/dccp/output.c
@@ -201,7 +201,7 @@ void dccp_write_space(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible(&wq->wait);
/* Should agree with poll, otherwise some programs break */
if (sock_writeable(sk))
diff --git a/net/dccp/proto.c b/net/dccp/proto.c
index b5cf13a28009..41e65804ddf5 100644
--- a/net/dccp/proto.c
+++ b/net/dccp/proto.c
@@ -339,8 +339,7 @@ unsigned int dccp_poll(struct file *file, struct socket *sock,
if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
- set_bit(SOCK_ASYNC_NOSPACE,
- &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
/* Race breaker. If space is freed after
diff --git a/net/decnet/af_decnet.c b/net/decnet/af_decnet.c
index 675cf94e04f8..13d6b1a6e0fc 100644
--- a/net/decnet/af_decnet.c
+++ b/net/decnet/af_decnet.c
@@ -678,6 +678,9 @@ static int dn_create(struct net *net, struct socket *sock, int protocol,
{
struct sock *sk;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (!net_eq(net, &init_net))
return -EAFNOSUPPORT;
@@ -1747,9 +1750,9 @@ static int dn_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
}
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
sk_wait_event(sk, &timeo, dn_data_ready(sk, queue, flags, target));
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
finish_wait(sk_sleep(sk), &wait);
}
@@ -2004,10 +2007,10 @@ static int dn_sendmsg(struct socket *sock, struct msghdr *msg, size_t size)
}
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
sk_wait_event(sk, &timeo,
!dn_queue_too_long(scp, queue, flags));
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
finish_wait(sk_sleep(sk), &wait);
continue;
}
diff --git a/net/dns_resolver/dns_query.c b/net/dns_resolver/dns_query.c
index 4677b6fa6dda..ecc28cff08ab 100644
--- a/net/dns_resolver/dns_query.c
+++ b/net/dns_resolver/dns_query.c
@@ -67,7 +67,7 @@
* Returns the size of the result on success, -ve error code otherwise.
*/
int dns_query(const char *type, const char *name, size_t namelen,
- const char *options, char **_result, time_t *_expiry)
+ const char *options, char **_result, time64_t *_expiry)
{
struct key *rkey;
const struct user_key_payload *upayload;
diff --git a/net/dsa/dsa.c b/net/dsa/dsa.c
index 1eba07feb34a..c28c47463b7e 100644
--- a/net/dsa/dsa.c
+++ b/net/dsa/dsa.c
@@ -21,8 +21,10 @@
#include <linux/of_mdio.h>
#include <linux/of_platform.h>
#include <linux/of_net.h>
+#include <linux/of_gpio.h>
#include <linux/sysfs.h>
#include <linux/phy_fixed.h>
+#include <linux/gpio/consumer.h>
#include "dsa_priv.h"
char dsa_driver_version[] = "0.1";
@@ -428,36 +430,30 @@ static void dsa_switch_destroy(struct dsa_switch *ds)
hwmon_device_unregister(ds->hwmon_dev);
#endif
- /* Disable configuration of the CPU and DSA ports */
+ /* Destroy network devices for physical switch ports. */
for (port = 0; port < DSA_MAX_PORTS; port++) {
- if (!(dsa_is_cpu_port(ds, port) || dsa_is_dsa_port(ds, port)))
+ if (!(ds->phys_port_mask & (1 << port)))
continue;
+ if (!ds->ports[port])
+ continue;
+
+ dsa_slave_destroy(ds->ports[port]);
+ }
+
+ /* Remove any fixed link PHYs */
+ for (port = 0; port < DSA_MAX_PORTS; port++) {
port_dn = cd->port_dn[port];
if (of_phy_is_fixed_link(port_dn)) {
phydev = of_phy_find_device(port_dn);
if (phydev) {
- int addr = phydev->addr;
-
phy_device_free(phydev);
of_node_put(port_dn);
- fixed_phy_del(addr);
+ fixed_phy_unregister(phydev);
}
}
}
- /* Destroy network devices for physical switch ports. */
- for (port = 0; port < DSA_MAX_PORTS; port++) {
- if (!(ds->phys_port_mask & (1 << port)))
- continue;
-
- if (!ds->ports[port])
- continue;
-
- unregister_netdev(ds->ports[port]);
- free_netdev(ds->ports[port]);
- }
-
mdiobus_unregister(ds->slave_mii_bus);
}
@@ -506,33 +502,6 @@ static int dsa_switch_resume(struct dsa_switch *ds)
}
#endif
-
-/* link polling *************************************************************/
-static void dsa_link_poll_work(struct work_struct *ugly)
-{
- struct dsa_switch_tree *dst;
- int i;
-
- dst = container_of(ugly, struct dsa_switch_tree, link_poll_work);
-
- for (i = 0; i < dst->pd->nr_chips; i++) {
- struct dsa_switch *ds = dst->ds[i];
-
- if (ds != NULL && ds->drv->poll_link != NULL)
- ds->drv->poll_link(ds);
- }
-
- mod_timer(&dst->link_poll_timer, round_jiffies(jiffies + HZ));
-}
-
-static void dsa_link_poll_timer(unsigned long _dst)
-{
- struct dsa_switch_tree *dst = (void *)_dst;
-
- schedule_work(&dst->link_poll_work);
-}
-
-
/* platform driver init and cleanup *****************************************/
static int dev_is_class(struct device *dev, void *class)
{
@@ -688,6 +657,9 @@ static int dsa_of_probe(struct device *dev)
const char *port_name;
int chip_index, port_index;
const unsigned int *sw_addr, *port_reg;
+ int gpio;
+ enum of_gpio_flags of_flags;
+ unsigned long flags;
u32 eeprom_len;
int ret;
@@ -766,6 +738,19 @@ static int dsa_of_probe(struct device *dev)
put_device(cd->host_dev);
cd->host_dev = &mdio_bus_switch->dev;
}
+ gpio = of_get_named_gpio_flags(child, "reset-gpios", 0,
+ &of_flags);
+ if (gpio_is_valid(gpio)) {
+ flags = (of_flags == OF_GPIO_ACTIVE_LOW ?
+ GPIOF_ACTIVE_LOW : 0);
+ ret = devm_gpio_request_one(dev, gpio, flags,
+ "switch_reset");
+ if (ret)
+ goto out_free_chip;
+
+ cd->reset = gpio_to_desc(gpio);
+ gpiod_direction_output(cd->reset, 0);
+ }
for_each_available_child_of_node(child, port) {
port_reg = of_get_property(port, "reg", NULL);
@@ -859,8 +844,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
}
dst->ds[i] = ds;
- if (ds->drv->poll_link != NULL)
- dst->link_poll_needed = 1;
++configured;
}
@@ -879,15 +862,6 @@ static int dsa_setup_dst(struct dsa_switch_tree *dst, struct net_device *dev,
wmb();
dev->dsa_ptr = (void *)dst;
- if (dst->link_poll_needed) {
- INIT_WORK(&dst->link_poll_work, dsa_link_poll_work);
- init_timer(&dst->link_poll_timer);
- dst->link_poll_timer.data = (unsigned long)dst;
- dst->link_poll_timer.function = dsa_link_poll_timer;
- dst->link_poll_timer.expires = round_jiffies(jiffies + HZ);
- add_timer(&dst->link_poll_timer);
- }
-
return 0;
}
@@ -939,8 +913,10 @@ static int dsa_probe(struct platform_device *pdev)
platform_set_drvdata(pdev, dst);
ret = dsa_setup_dst(dst, dev, &pdev->dev, pd);
- if (ret)
+ if (ret) {
+ dev_put(dev);
goto out;
+ }
return 0;
@@ -954,10 +930,13 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
{
int i;
- if (dst->link_poll_needed)
- del_timer_sync(&dst->link_poll_timer);
+ dst->master_netdev->dsa_ptr = NULL;
- flush_work(&dst->link_poll_work);
+ /* If we used a tagging format that doesn't have an ethertype
+ * field, make sure that all packets from this point get sent
+ * without the tag and go through the regular receive path.
+ */
+ wmb();
for (i = 0; i < dst->pd->nr_chips; i++) {
struct dsa_switch *ds = dst->ds[i];
@@ -965,6 +944,8 @@ static void dsa_remove_dst(struct dsa_switch_tree *dst)
if (ds)
dsa_switch_destroy(ds);
}
+
+ dev_put(dst->master_netdev);
}
static int dsa_remove(struct platform_device *pdev)
diff --git a/net/dsa/dsa_priv.h b/net/dsa/dsa_priv.h
index 311796c809af..1d1a54687e4a 100644
--- a/net/dsa/dsa_priv.h
+++ b/net/dsa/dsa_priv.h
@@ -61,6 +61,7 @@ extern const struct dsa_device_ops notag_netdev_ops;
void dsa_slave_mii_bus_init(struct dsa_switch *ds);
int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
int port, char *name);
+void dsa_slave_destroy(struct net_device *slave_dev);
int dsa_slave_suspend(struct net_device *slave_dev);
int dsa_slave_resume(struct net_device *slave_dev);
int dsa_slave_netdevice_event(struct notifier_block *unused,
diff --git a/net/dsa/slave.c b/net/dsa/slave.c
index 7bc787b095c8..a575f0350d5a 100644
--- a/net/dsa/slave.c
+++ b/net/dsa/slave.c
@@ -15,6 +15,7 @@
#include <linux/phy_fixed.h>
#include <linux/of_net.h>
#include <linux/of_mdio.h>
+#include <linux/mdio.h>
#include <net/rtnetlink.h>
#include <net/switchdev.h>
#include <linux/if_bridge.h>
@@ -200,47 +201,6 @@ out:
return 0;
}
-static int dsa_bridge_check_vlan_range(struct dsa_switch *ds,
- const struct net_device *bridge,
- u16 vid_begin, u16 vid_end)
-{
- struct dsa_slave_priv *p;
- struct net_device *dev, *vlan_br;
- DECLARE_BITMAP(members, DSA_MAX_PORTS);
- DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
- u16 vid;
- int member, err;
-
- if (!ds->drv->vlan_getnext || !vid_begin)
- return -EOPNOTSUPP;
-
- vid = vid_begin - 1;
-
- do {
- err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
- if (err)
- break;
-
- if (vid > vid_end)
- break;
-
- member = find_first_bit(members, DSA_MAX_PORTS);
- if (member == DSA_MAX_PORTS)
- continue;
-
- dev = ds->ports[member];
- p = netdev_priv(dev);
- vlan_br = p->bridge_dev;
- if (vlan_br == bridge)
- continue;
-
- netdev_dbg(vlan_br, "hardware VLAN %d already in use\n", vid);
- return -EOPNOTSUPP;
- } while (vid < vid_end);
-
- return err == -ENOENT ? 0 : err;
-}
-
static int dsa_slave_port_vlan_add(struct net_device *dev,
const struct switchdev_obj_port_vlan *vlan,
struct switchdev_trans *trans)
@@ -253,15 +213,6 @@ static int dsa_slave_port_vlan_add(struct net_device *dev,
if (!ds->drv->port_vlan_prepare || !ds->drv->port_vlan_add)
return -EOPNOTSUPP;
- /* If the requested port doesn't belong to the same bridge as
- * the VLAN members, fallback to software VLAN (hopefully).
- */
- err = dsa_bridge_check_vlan_range(ds, p->bridge_dev,
- vlan->vid_begin,
- vlan->vid_end);
- if (err)
- return err;
-
err = ds->drv->port_vlan_prepare(ds, p->port, vlan, trans);
if (err)
return err;
@@ -292,41 +243,11 @@ static int dsa_slave_port_vlan_dump(struct net_device *dev,
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- DECLARE_BITMAP(members, DSA_MAX_PORTS);
- DECLARE_BITMAP(untagged, DSA_MAX_PORTS);
- u16 pvid, vid = 0;
- int err;
-
- if (!ds->drv->vlan_getnext || !ds->drv->port_pvid_get)
- return -EOPNOTSUPP;
-
- err = ds->drv->port_pvid_get(ds, p->port, &pvid);
- if (err)
- return err;
-
- for (;;) {
- err = ds->drv->vlan_getnext(ds, &vid, members, untagged);
- if (err)
- break;
-
- if (!test_bit(p->port, members))
- continue;
-
- memset(vlan, 0, sizeof(*vlan));
- vlan->vid_begin = vlan->vid_end = vid;
- if (vid == pvid)
- vlan->flags |= BRIDGE_VLAN_INFO_PVID;
+ if (ds->drv->port_vlan_dump)
+ return ds->drv->port_vlan_dump(ds, p->port, vlan, cb);
- if (test_bit(p->port, untagged))
- vlan->flags |= BRIDGE_VLAN_INFO_UNTAGGED;
-
- err = cb(&vlan->obj);
- if (err)
- break;
- }
-
- return err == -ENOENT ? 0 : err;
+ return -EOPNOTSUPP;
}
static int dsa_slave_port_fdb_add(struct net_device *dev,
@@ -384,31 +305,6 @@ static int dsa_slave_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
return -EOPNOTSUPP;
}
-/* Return a bitmask of all ports being currently bridged within a given bridge
- * device. Note that on leave, the mask will still return the bitmask of ports
- * currently bridged, prior to port removal, and this is exactly what we want.
- */
-static u32 dsa_slave_br_port_mask(struct dsa_switch *ds,
- struct net_device *bridge)
-{
- struct dsa_slave_priv *p;
- unsigned int port;
- u32 mask = 0;
-
- for (port = 0; port < DSA_MAX_PORTS; port++) {
- if (!dsa_is_port_initialized(ds, port))
- continue;
-
- p = netdev_priv(ds->ports[port]);
-
- if (ds->ports[port]->priv_flags & IFF_BRIDGE_PORT &&
- p->bridge_dev == bridge)
- mask |= 1 << port;
- }
-
- return mask;
-}
-
static int dsa_slave_stp_update(struct net_device *dev, u8 state)
{
struct dsa_slave_priv *p = netdev_priv(dev);
@@ -421,6 +317,24 @@ static int dsa_slave_stp_update(struct net_device *dev, u8 state)
return ret;
}
+static int dsa_slave_vlan_filtering(struct net_device *dev,
+ const struct switchdev_attr *attr,
+ struct switchdev_trans *trans)
+{
+ struct dsa_slave_priv *p = netdev_priv(dev);
+ struct dsa_switch *ds = p->parent;
+
+ /* bridge skips -EOPNOTSUPP, so skip the prepare phase */
+ if (switchdev_trans_ph_prepare(trans))
+ return 0;
+
+ if (ds->drv->port_vlan_filtering)
+ return ds->drv->port_vlan_filtering(ds, p->port,
+ attr->u.vlan_filtering);
+
+ return 0;
+}
+
static int dsa_slave_port_attr_set(struct net_device *dev,
const struct switchdev_attr *attr,
struct switchdev_trans *trans)
@@ -437,6 +351,9 @@ static int dsa_slave_port_attr_set(struct net_device *dev,
ret = ds->drv->port_stp_update(ds, p->port,
attr->u.stp_state);
break;
+ case SWITCHDEV_ATTR_ID_BRIDGE_VLAN_FILTERING:
+ ret = dsa_slave_vlan_filtering(dev, attr, trans);
+ break;
default:
ret = -EOPNOTSUPP;
break;
@@ -531,23 +448,20 @@ static int dsa_slave_bridge_port_join(struct net_device *dev,
p->bridge_dev = br;
- if (ds->drv->port_join_bridge)
- ret = ds->drv->port_join_bridge(ds, p->port,
- dsa_slave_br_port_mask(ds, br));
+ if (ds->drv->port_bridge_join)
+ ret = ds->drv->port_bridge_join(ds, p->port, br);
- return ret;
+ return ret == -EOPNOTSUPP ? 0 : ret;
}
-static int dsa_slave_bridge_port_leave(struct net_device *dev)
+static void dsa_slave_bridge_port_leave(struct net_device *dev)
{
struct dsa_slave_priv *p = netdev_priv(dev);
struct dsa_switch *ds = p->parent;
- int ret = -EOPNOTSUPP;
- if (ds->drv->port_leave_bridge)
- ret = ds->drv->port_leave_bridge(ds, p->port,
- dsa_slave_br_port_mask(ds, p->bridge_dev));
+ if (ds->drv->port_bridge_leave)
+ ds->drv->port_bridge_leave(ds, p->port);
p->bridge_dev = NULL;
@@ -555,8 +469,6 @@ static int dsa_slave_bridge_port_leave(struct net_device *dev)
* so allow it to be in BR_STATE_FORWARDING to be kept functional
*/
dsa_slave_stp_update(dev, BR_STATE_FORWARDING);
-
- return ret;
}
static int dsa_slave_port_attr_get(struct net_device *dev,
@@ -981,11 +893,15 @@ static void dsa_slave_adjust_link(struct net_device *dev)
static int dsa_slave_fixed_link_update(struct net_device *dev,
struct fixed_phy_status *status)
{
- struct dsa_slave_priv *p = netdev_priv(dev);
- struct dsa_switch *ds = p->parent;
+ struct dsa_slave_priv *p;
+ struct dsa_switch *ds;
- if (ds->drv->fixed_link_update)
- ds->drv->fixed_link_update(ds, p->port, status);
+ if (dev) {
+ p = netdev_priv(dev);
+ ds = p->parent;
+ if (ds->drv->fixed_link_update)
+ ds->drv->fixed_link_update(ds, p->port, status);
+ }
return 0;
}
@@ -997,7 +913,7 @@ static int dsa_slave_phy_connect(struct dsa_slave_priv *p,
{
struct dsa_switch *ds = p->parent;
- p->phy = ds->slave_mii_bus->phy_map[addr];
+ p->phy = mdiobus_get_phy(ds->slave_mii_bus, addr);
if (!p->phy) {
netdev_err(slave_dev, "no phy at %d\n", addr);
return -ENODEV;
@@ -1080,11 +996,10 @@ static int dsa_slave_phy_setup(struct dsa_slave_priv *p,
netdev_err(slave_dev, "failed to connect to port %d: %d\n", p->port, ret);
return ret;
}
- } else {
- netdev_info(slave_dev, "attached PHY at address %d [%s]\n",
- p->phy->addr, p->phy->drv->name);
}
+ phy_attached_info(p->phy);
+
return 0;
}
@@ -1189,19 +1104,11 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
p->old_link = -1;
p->old_duplex = -1;
- ret = dsa_slave_phy_setup(p, slave_dev);
- if (ret) {
- netdev_err(master, "error %d setting up slave phy\n", ret);
- free_netdev(slave_dev);
- return ret;
- }
-
ds->ports[port] = slave_dev;
ret = register_netdev(slave_dev);
if (ret) {
netdev_err(master, "error %d registering interface %s\n",
ret, slave_dev->name);
- phy_disconnect(p->phy);
ds->ports[port] = NULL;
free_netdev(slave_dev);
return ret;
@@ -1209,48 +1116,73 @@ int dsa_slave_create(struct dsa_switch *ds, struct device *parent,
netif_carrier_off(slave_dev);
+ ret = dsa_slave_phy_setup(p, slave_dev);
+ if (ret) {
+ netdev_err(master, "error %d setting up slave phy\n", ret);
+ unregister_netdev(slave_dev);
+ free_netdev(slave_dev);
+ return ret;
+ }
+
return 0;
}
+void dsa_slave_destroy(struct net_device *slave_dev)
+{
+ struct dsa_slave_priv *p = netdev_priv(slave_dev);
+
+ netif_carrier_off(slave_dev);
+ if (p->phy)
+ phy_disconnect(p->phy);
+ unregister_netdev(slave_dev);
+ free_netdev(slave_dev);
+}
+
static bool dsa_slave_dev_check(struct net_device *dev)
{
return dev->netdev_ops == &dsa_slave_netdev_ops;
}
-static int dsa_slave_master_changed(struct net_device *dev)
+static int dsa_slave_port_upper_event(struct net_device *dev,
+ unsigned long event, void *ptr)
{
- struct net_device *master = netdev_master_upper_dev_get(dev);
- struct dsa_slave_priv *p = netdev_priv(dev);
+ struct netdev_notifier_changeupper_info *info = ptr;
+ struct net_device *upper = info->upper_dev;
int err = 0;
- if (master && master->rtnl_link_ops &&
- !strcmp(master->rtnl_link_ops->kind, "bridge"))
- err = dsa_slave_bridge_port_join(dev, master);
- else if (dsa_port_is_bridged(p))
- err = dsa_slave_bridge_port_leave(dev);
+ switch (event) {
+ case NETDEV_CHANGEUPPER:
+ if (netif_is_bridge_master(upper)) {
+ if (info->linking)
+ err = dsa_slave_bridge_port_join(dev, upper);
+ else
+ dsa_slave_bridge_port_leave(dev);
+ }
+
+ break;
+ }
- return err;
+ return notifier_from_errno(err);
}
-int dsa_slave_netdevice_event(struct notifier_block *unused,
- unsigned long event, void *ptr)
+static int dsa_slave_port_event(struct net_device *dev, unsigned long event,
+ void *ptr)
{
- struct net_device *dev;
- int err = 0;
-
switch (event) {
case NETDEV_CHANGEUPPER:
- dev = netdev_notifier_info_to_dev(ptr);
- if (!dsa_slave_dev_check(dev))
- goto out;
+ return dsa_slave_port_upper_event(dev, event, ptr);
+ }
- err = dsa_slave_master_changed(dev);
- if (err && err != -EOPNOTSUPP)
- netdev_warn(dev, "failed to reflect master change\n");
+ return NOTIFY_DONE;
+}
- break;
- }
+int dsa_slave_netdevice_event(struct notifier_block *unused,
+ unsigned long event, void *ptr)
+{
+ struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+
+ if (dsa_slave_dev_check(dev))
+ return dsa_slave_port_event(dev, event, ptr);
-out:
return NOTIFY_DONE;
}
diff --git a/net/ethernet/eth.c b/net/ethernet/eth.c
index 9e63f252a89e..66dff5e3d772 100644
--- a/net/ethernet/eth.c
+++ b/net/ethernet/eth.c
@@ -52,6 +52,8 @@
#include <linux/errno.h>
#include <linux/init.h>
#include <linux/if_ether.h>
+#include <linux/of_net.h>
+#include <linux/pci.h>
#include <net/dst.h>
#include <net/arp.h>
#include <net/sock.h>
@@ -123,6 +125,7 @@ EXPORT_SYMBOL(eth_header);
*/
u32 eth_get_headlen(void *data, unsigned int len)
{
+ const unsigned int flags = FLOW_DISSECTOR_F_PARSE_1ST_FRAG;
const struct ethhdr *eth = (const struct ethhdr *)data;
struct flow_keys keys;
@@ -132,7 +135,7 @@ u32 eth_get_headlen(void *data, unsigned int len)
/* parse any remaining L2/L3 headers, check for L4 */
if (!skb_flow_dissect_flow_keys_buf(&keys, data, eth->h_proto,
- sizeof(*eth), len, 0))
+ sizeof(*eth), len, flags))
return max_t(u32, keys.control.thoff, sizeof(*eth));
/* parse for any L4 headers */
@@ -485,3 +488,32 @@ static int __init eth_offload_init(void)
}
fs_initcall(eth_offload_init);
+
+unsigned char * __weak arch_get_platform_mac_address(void)
+{
+ return NULL;
+}
+
+int eth_platform_get_mac_address(struct device *dev, u8 *mac_addr)
+{
+ const unsigned char *addr;
+ struct device_node *dp;
+
+ if (dev_is_pci(dev))
+ dp = pci_device_to_OF_node(to_pci_dev(dev));
+ else
+ dp = dev->of_node;
+
+ addr = NULL;
+ if (dp)
+ addr = of_get_mac_address(dp);
+ if (!addr)
+ addr = arch_get_platform_mac_address();
+
+ if (!addr)
+ return -ENODEV;
+
+ ether_addr_copy(mac_addr, addr);
+ return 0;
+}
+EXPORT_SYMBOL(eth_platform_get_mac_address);
diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 35a9788bb3ae..c7d1adca30d8 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -312,7 +312,7 @@ static void send_hsr_supervision_frame(struct hsr_port *master, u8 type)
return;
out:
- WARN_ON_ONCE("HSR: Could not send supervision frame\n");
+ WARN_ONCE(1, "HSR: Could not send supervision frame\n");
kfree_skb(skb);
}
diff --git a/net/ieee802154/6lowpan/core.c b/net/ieee802154/6lowpan/core.c
index 20c49c724ba0..0023c9048812 100644
--- a/net/ieee802154/6lowpan/core.c
+++ b/net/ieee802154/6lowpan/core.c
@@ -161,9 +161,7 @@ static int lowpan_newlink(struct net *src_net, struct net_device *ldev,
wdev->needed_headroom;
ldev->needed_tailroom = wdev->needed_tailroom;
- lowpan_netdev_setup(ldev, LOWPAN_LLTYPE_IEEE802154);
-
- ret = register_netdevice(ldev);
+ ret = lowpan_register_netdevice(ldev, LOWPAN_LLTYPE_IEEE802154);
if (ret < 0) {
dev_put(wdev);
return ret;
@@ -180,7 +178,7 @@ static void lowpan_dellink(struct net_device *ldev, struct list_head *head)
ASSERT_RTNL();
wdev->ieee802154_ptr->lowpan_dev = NULL;
- unregister_netdevice(ldev);
+ lowpan_unregister_netdevice(ldev);
dev_put(wdev);
}
@@ -209,7 +207,7 @@ static int lowpan_device_event(struct notifier_block *unused,
struct net_device *wdev = netdev_notifier_info_to_dev(ptr);
if (wdev->type != ARPHRD_IEEE802154)
- goto out;
+ return NOTIFY_DONE;
switch (event) {
case NETDEV_UNREGISTER:
@@ -221,11 +219,10 @@ static int lowpan_device_event(struct notifier_block *unused,
lowpan_dellink(wdev->ieee802154_ptr->lowpan_dev, NULL);
break;
default:
- break;
+ return NOTIFY_DONE;
}
-out:
- return NOTIFY_DONE;
+ return NOTIFY_OK;
}
static struct notifier_block lowpan_dev_notifier = {
diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c
index 6b437e8760d3..30d875dff6b5 100644
--- a/net/ieee802154/6lowpan/reassembly.c
+++ b/net/ieee802154/6lowpan/reassembly.c
@@ -624,7 +624,6 @@ int __init lowpan_net_frag_init(void)
lowpan_frags.hashfn = lowpan_hashfn;
lowpan_frags.constructor = lowpan_frag_init;
lowpan_frags.destructor = NULL;
- lowpan_frags.skb_free = NULL;
lowpan_frags.qsize = sizeof(struct frag_queue);
lowpan_frags.match = lowpan_frag_match;
lowpan_frags.frag_expire = lowpan_frag_expire;
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a548be247e15..e0bd013a1e5e 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, unsigned int cmd,
static HLIST_HEAD(raw_head);
static DEFINE_RWLOCK(raw_lock);
-static void raw_hash(struct sock *sk)
+static int raw_hash(struct sock *sk)
{
write_lock_bh(&raw_lock);
sk_add_node(sk, &raw_head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&raw_lock);
+
+ return 0;
}
static void raw_unhash(struct sock *sk)
@@ -462,12 +464,14 @@ static inline struct dgram_sock *dgram_sk(const struct sock *sk)
return container_of(sk, struct dgram_sock, sk);
}
-static void dgram_hash(struct sock *sk)
+static int dgram_hash(struct sock *sk)
{
write_lock_bh(&dgram_lock);
sk_add_node(sk, &dgram_head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&dgram_lock);
+
+ return 0;
}
static void dgram_unhash(struct sock *sk)
@@ -1026,8 +1030,13 @@ static int ieee802154_create(struct net *net, struct socket *sock,
/* Checksums on by default */
sock_set_flag(sk, SOCK_ZAPPED);
- if (sk->sk_prot->hash)
- sk->sk_prot->hash(sk);
+ if (sk->sk_prot->hash) {
+ rc = sk->sk_prot->hash(sk);
+ if (rc) {
+ sk_common_release(sk);
+ goto out;
+ }
+ }
if (sk->sk_prot->init) {
rc = sk->sk_prot->init(sk);
diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 416dfa004cfb..238225b0c970 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -186,6 +186,7 @@ config NET_IPGRE_DEMUX
config NET_IP_TUNNEL
tristate
+ select DST_CACHE
default n
config NET_IPGRE
@@ -353,6 +354,7 @@ config INET_ESP
select CRYPTO_CBC
select CRYPTO_SHA1
select CRYPTO_DES
+ select CRYPTO_ECHAINIV
---help---
Support for IPsec ESP.
@@ -404,14 +406,6 @@ config INET_XFRM_MODE_BEET
If unsure, say Y.
-config INET_LRO
- tristate "Large Receive Offload (ipv4/tcp)"
- default y
- ---help---
- Support for Large Receive Offload (ipv4/tcp).
-
- If unsure, say Y.
-
config INET_DIAG
tristate "INET: socket monitoring interface"
default y
@@ -436,6 +430,19 @@ config INET_UDP_DIAG
Support for UDP socket monitoring interface used by the ss tool.
If unsure, say Y.
+config INET_DIAG_DESTROY
+ bool "INET: allow privileged process to administratively close sockets"
+ depends on INET_DIAG
+ default n
+ ---help---
+ Provides a SOCK_DESTROY operation that allows privileged processes
+ (e.g., a connection manager or a network administration tool such as
+ ss) to close sockets opened by other processes. Closing a socket in
+ this way interrupts any blocking read/write/connect operations on
+ the socket and causes future socket calls to behave as if the socket
+ had been disconnected.
+ If unsure, say N.
+
menuconfig TCP_CONG_ADVANCED
bool "TCP: advanced congestion control"
---help---
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index c29809f765dc..bfa133691cde 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -32,7 +32,6 @@ obj-$(CONFIG_INET_ESP) += esp4.o
obj-$(CONFIG_INET_IPCOMP) += ipcomp.o
obj-$(CONFIG_INET_XFRM_TUNNEL) += xfrm4_tunnel.o
obj-$(CONFIG_INET_XFRM_MODE_BEET) += xfrm4_mode_beet.o
-obj-$(CONFIG_INET_LRO) += inet_lro.o
obj-$(CONFIG_INET_TUNNEL) += tunnel4.o
obj-$(CONFIG_INET_XFRM_MODE_TRANSPORT) += xfrm4_mode_transport.o
obj-$(CONFIG_INET_XFRM_MODE_TUNNEL) += xfrm4_mode_tunnel.o
@@ -56,7 +55,6 @@ obj-$(CONFIG_TCP_CONG_SCALABLE) += tcp_scalable.o
obj-$(CONFIG_TCP_CONG_LP) += tcp_lp.o
obj-$(CONFIG_TCP_CONG_YEAH) += tcp_yeah.o
obj-$(CONFIG_TCP_CONG_ILLINOIS) += tcp_illinois.o
-obj-$(CONFIG_MEMCG_KMEM) += tcp_memcontrol.o
obj-$(CONFIG_NETLABEL) += cipso_ipv4.o
obj-$(CONFIG_XFRM) += xfrm4_policy.o xfrm4_state.o xfrm4_input.o \
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 11c4ca13ec3b..9e481992dbae 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -257,6 +257,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
sock->state = SS_UNCONNECTED;
/* Look for the requested type/protocol pair. */
@@ -367,7 +370,11 @@ lookup_protocol:
*/
inet->inet_sport = htons(inet->inet_num);
/* Add to protocol hash chains. */
- sk->sk_prot->hash(sk);
+ err = sk->sk_prot->hash(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
}
if (sk->sk_prot->init) {
@@ -1088,12 +1095,6 @@ void inet_unregister_protosw(struct inet_protosw *p)
}
EXPORT_SYMBOL(inet_unregister_protosw);
-/*
- * Shall we try to damage output packets if routing dev changes?
- */
-
-int sysctl_ip_dynaddr __read_mostly;
-
static int inet_sk_reselect_saddr(struct sock *sk)
{
struct inet_sock *inet = inet_sk(sk);
@@ -1124,7 +1125,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
if (new_saddr == old_saddr)
return 0;
- if (sysctl_ip_dynaddr > 1) {
+ if (sock_net(sk)->ipv4.sysctl_ip_dynaddr > 1) {
pr_info("%s(): shifting inet->saddr from %pI4 to %pI4\n",
__func__, &old_saddr, &new_saddr);
}
@@ -1139,8 +1140,7 @@ static int inet_sk_reselect_saddr(struct sock *sk)
* Besides that, it does not check for connection
* uniqueness. Wait for troubles.
*/
- __sk_prot_rehash(sk);
- return 0;
+ return __sk_prot_rehash(sk);
}
int inet_sk_rebuild_header(struct sock *sk)
@@ -1180,7 +1180,7 @@ int inet_sk_rebuild_header(struct sock *sk)
* Other protocols have to map its equivalent state to TCP_SYN_SENT.
* DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
*/
- if (!sysctl_ip_dynaddr ||
+ if (!sock_net(sk)->ipv4.sysctl_ip_dynaddr ||
sk->sk_state != TCP_SYN_SENT ||
(sk->sk_userlocks & SOCK_BINDADDR_LOCK) ||
(err = inet_sk_reselect_saddr(sk)) != 0)
@@ -1380,6 +1380,45 @@ out:
return pp;
}
+static struct sk_buff **ipip_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ if (NAPI_GRO_CB(skb)->encap_mark) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ return inet_gro_receive(head, skb);
+}
+
+#define SECONDS_PER_DAY 86400
+
+/* inet_current_timestamp - Return IP network timestamp
+ *
+ * Return milliseconds since midnight in network byte order.
+ */
+__be32 inet_current_timestamp(void)
+{
+ u32 secs;
+ u32 msecs;
+ struct timespec64 ts;
+
+ ktime_get_real_ts64(&ts);
+
+ /* Get secs since midnight. */
+ (void)div_u64_rem(ts.tv_sec, SECONDS_PER_DAY, &secs);
+ /* Convert to msecs. */
+ msecs = secs * MSEC_PER_SEC;
+ /* Convert nsec to msec. */
+ msecs += (u32)ts.tv_nsec / NSEC_PER_MSEC;
+
+ /* Convert to network byte order. */
+ return htonl(msecs);
+}
+EXPORT_SYMBOL(inet_current_timestamp);
+
int inet_recv_error(struct sock *sk, struct msghdr *msg, int len, int *addr_len)
{
if (sk->sk_family == AF_INET)
@@ -1422,6 +1461,13 @@ out_unlock:
return err;
}
+static int ipip_gro_complete(struct sk_buff *skb, int nhoff)
+{
+ skb->encapsulation = 1;
+ skb_shinfo(skb)->gso_type |= SKB_GSO_IPIP;
+ return inet_gro_complete(skb, nhoff);
+}
+
int inet_ctl_sock_create(struct sock **sk, unsigned short family,
unsigned short type, unsigned char protocol,
struct net *net)
@@ -1649,8 +1695,8 @@ static struct packet_offload ip_packet_offload __read_mostly = {
static const struct net_offload ipip_offload = {
.callbacks = {
.gso_segment = inet_gso_segment,
- .gro_receive = inet_gro_receive,
- .gro_complete = inet_gro_complete,
+ .gro_receive = ipip_gro_receive,
+ .gro_complete = ipip_gro_complete,
},
};
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 59b3e0e8fd51..c34c7544d1db 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -665,7 +665,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (!in_dev)
- goto out;
+ goto out_free_skb;
arp = arp_hdr(skb);
@@ -673,7 +673,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
default:
if (arp->ar_pro != htons(ETH_P_IP) ||
htons(dev_type) != arp->ar_hrd)
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_ETHER:
case ARPHRD_FDDI:
@@ -690,17 +690,17 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
if ((arp->ar_hrd != htons(ARPHRD_ETHER) &&
arp->ar_hrd != htons(ARPHRD_IEEE802)) ||
arp->ar_pro != htons(ETH_P_IP))
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_AX25:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_AX25))
- goto out;
+ goto out_free_skb;
break;
case ARPHRD_NETROM:
if (arp->ar_pro != htons(AX25_P_IP) ||
arp->ar_hrd != htons(ARPHRD_NETROM))
- goto out;
+ goto out_free_skb;
break;
}
@@ -708,7 +708,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
if (arp->ar_op != htons(ARPOP_REPLY) &&
arp->ar_op != htons(ARPOP_REQUEST))
- goto out;
+ goto out_free_skb;
/*
* Extract fields
@@ -733,7 +733,15 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
*/
if (ipv4_is_multicast(tip) ||
(!IN_DEV_ROUTE_LOCALNET(in_dev) && ipv4_is_loopback(tip)))
- goto out;
+ goto out_free_skb;
+
+ /*
+ * For some 802.11 wireless deployments (and possibly other networks),
+ * there will be an ARP proxy and gratuitous ARP frames are attacks
+ * and thus should not be accepted.
+ */
+ if (sip == tip && IN_DEV_ORCONF(in_dev, DROP_GRATUITOUS_ARP))
+ goto out_free_skb;
/*
* Special case: We must set Frame Relay source Q.922 address
@@ -770,7 +778,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
!arp_ignore(in_dev, sip, tip))
arp_send_dst(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip,
sha, dev->dev_addr, sha, reply_dst);
- goto out;
+ goto out_consume_skb;
}
if (arp->ar_op == htons(ARPOP_REQUEST) &&
@@ -795,7 +803,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
neigh_release(n);
}
}
- goto out;
+ goto out_consume_skb;
} else if (IN_DEV_FORWARD(in_dev)) {
if (addr_type == RTN_UNICAST &&
(arp_fwd_proxy(in_dev, dev, rt) ||
@@ -818,7 +826,7 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
in_dev->arp_parms, skb);
goto out_free_dst;
}
- goto out;
+ goto out_consume_skb;
}
}
}
@@ -868,11 +876,16 @@ static int arp_process(struct net *net, struct sock *sk, struct sk_buff *skb)
neigh_release(n);
}
-out:
+out_consume_skb:
consume_skb(skb);
+
out_free_dst:
dst_release(reply_dst);
- return 0;
+ return NET_RX_SUCCESS;
+
+out_free_skb:
+ kfree_skb(skb);
+ return NET_RX_DROP;
}
static void parp_redo(struct sk_buff *skb)
@@ -916,11 +929,11 @@ static int arp_rcv(struct sk_buff *skb, struct net_device *dev,
consumeskb:
consume_skb(skb);
- return 0;
+ return NET_RX_SUCCESS;
freeskb:
kfree_skb(skb);
out_of_mem:
- return 0;
+ return NET_RX_DROP;
}
/*
diff --git a/net/ipv4/devinet.c b/net/ipv4/devinet.c
index cebd9d31e65a..e333bc86bd39 100644
--- a/net/ipv4/devinet.c
+++ b/net/ipv4/devinet.c
@@ -334,6 +334,9 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
ASSERT_RTNL();
+ if (in_dev->dead)
+ goto no_promotions;
+
/* 1. Deleting primary ifaddr forces deletion all secondaries
* unless alias promotion is set
**/
@@ -380,6 +383,7 @@ static void __inet_del_ifa(struct in_device *in_dev, struct in_ifaddr **ifap,
fib_del_ifaddr(ifa, ifa1);
}
+no_promotions:
/* 2. Unlink it */
*ifap = ifa1->ifa_next;
@@ -1194,6 +1198,7 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
__be32 addr = 0;
struct in_device *in_dev;
struct net *net = dev_net(dev);
+ int master_idx;
rcu_read_lock();
in_dev = __in_dev_get_rcu(dev);
@@ -1214,12 +1219,33 @@ __be32 inet_select_addr(const struct net_device *dev, __be32 dst, int scope)
if (addr)
goto out_unlock;
no_in_dev:
+ master_idx = l3mdev_master_ifindex_rcu(dev);
+
+ /* For VRFs, the VRF device takes the place of the loopback device,
+ * with addresses on it being preferred. Note in such cases the
+ * loopback device will be among the devices that fail the master_idx
+ * equality check in the loop below.
+ */
+ if (master_idx &&
+ (dev = dev_get_by_index_rcu(net, master_idx)) &&
+ (in_dev = __in_dev_get_rcu(dev))) {
+ for_primary_ifa(in_dev) {
+ if (ifa->ifa_scope != RT_SCOPE_LINK &&
+ ifa->ifa_scope <= scope) {
+ addr = ifa->ifa_local;
+ goto out_unlock;
+ }
+ } endfor_ifa(in_dev);
+ }
/* Not loopback addresses on loopback should be preferred
in this case. It is important that lo is the first interface
in dev_base list.
*/
for_each_netdev_rcu(net, dev) {
+ if (l3mdev_master_ifindex_rcu(dev) != master_idx)
+ continue;
+
in_dev = __in_dev_get_rcu(dev);
if (!in_dev)
continue;
@@ -1731,17 +1757,20 @@ static int inet_netconf_msgsize_devconf(int type)
{
int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ nla_total_size(4); /* NETCONFA_IFINDEX */
+ bool all = false;
+
+ if (type == NETCONFA_ALL)
+ all = true;
- /* type -1 is used for ALL */
- if (type == -1 || type == NETCONFA_FORWARDING)
+ if (all || type == NETCONFA_FORWARDING)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_RP_FILTER)
+ if (all || type == NETCONFA_RP_FILTER)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_MC_FORWARDING)
+ if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+ if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
size += nla_total_size(4);
return size;
@@ -1754,36 +1783,39 @@ static int inet_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
{
struct nlmsghdr *nlh;
struct netconfmsg *ncm;
+ bool all = false;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
flags);
if (!nlh)
return -EMSGSIZE;
+ if (type == NETCONFA_ALL)
+ all = true;
+
ncm = nlmsg_data(nlh);
ncm->ncm_family = AF_INET;
if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
goto nla_put_failure;
- /* type -1 is used for ALL */
- if ((type == -1 || type == NETCONFA_FORWARDING) &&
+ if ((all || type == NETCONFA_FORWARDING) &&
nla_put_s32(skb, NETCONFA_FORWARDING,
IPV4_DEVCONF(*devconf, FORWARDING)) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_RP_FILTER) &&
+ if ((all || type == NETCONFA_RP_FILTER) &&
nla_put_s32(skb, NETCONFA_RP_FILTER,
IPV4_DEVCONF(*devconf, RP_FILTER)) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+ if ((all || type == NETCONFA_MC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
IPV4_DEVCONF(*devconf, MC_FORWARDING)) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+ if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH,
IPV4_DEVCONF(*devconf, PROXY_ARP)) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
IPV4_DEVCONF(*devconf, IGNORE_ROUTES_WITH_LINKDOWN)) < 0)
goto nla_put_failure;
@@ -1847,7 +1879,7 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
if (err < 0)
goto errout;
- err = EINVAL;
+ err = -EINVAL;
if (!tb[NETCONFA_IFINDEX])
goto errout;
@@ -1871,14 +1903,14 @@ static int inet_netconf_get_devconf(struct sk_buff *in_skb,
}
err = -ENOBUFS;
- skb = nlmsg_new(inet_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+ skb = nlmsg_new(inet_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
if (!skb)
goto errout;
err = inet_netconf_fill_devconf(skb, ifindex, devconf,
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
- -1);
+ NETCONFA_ALL);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet_netconf_msgsize_devconf() */
WARN_ON(err == -EMSGSIZE);
@@ -1922,7 +1954,7 @@ static int inet_netconf_dump_devconf(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF,
NLM_F_MULTI,
- -1) < 0) {
+ NETCONFA_ALL) < 0) {
rcu_read_unlock();
goto done;
}
@@ -1938,7 +1970,7 @@ cont:
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
- -1) < 0)
+ NETCONFA_ALL) < 0)
goto done;
else
h++;
@@ -1949,7 +1981,7 @@ cont:
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
- -1) < 0)
+ NETCONFA_ALL) < 0)
goto done;
else
h++;
@@ -2185,6 +2217,8 @@ static struct devinet_sysctl_table {
"igmpv3_unsolicited_report_interval"),
DEVINET_SYSCTL_RW_ENTRY(IGNORE_ROUTES_WITH_LINKDOWN,
"ignore_routes_with_linkdown"),
+ DEVINET_SYSCTL_RW_ENTRY(DROP_GRATUITOUS_ARP,
+ "drop_gratuitous_arp"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOXFRM, "disable_xfrm"),
DEVINET_SYSCTL_FLUSHING_ENTRY(NOPOLICY, "disable_policy"),
@@ -2192,6 +2226,8 @@ static struct devinet_sysctl_table {
"promote_secondaries"),
DEVINET_SYSCTL_FLUSHING_ENTRY(ROUTE_LOCALNET,
"route_localnet"),
+ DEVINET_SYSCTL_FLUSHING_ENTRY(DROP_UNICAST_IN_L2_MULTICAST,
+ "drop_unicast_in_l2_multicast"),
},
};
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index cc8f3e506cde..8a9246deccfe 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -280,7 +280,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
struct in_device *in_dev;
struct fib_result res;
struct rtable *rt;
- struct flowi4 fl4;
struct net *net;
int scope;
@@ -296,14 +295,13 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
scope = RT_SCOPE_UNIVERSE;
if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
- fl4.flowi4_oif = 0;
- fl4.flowi4_iif = LOOPBACK_IFINDEX;
- fl4.daddr = ip_hdr(skb)->saddr;
- fl4.saddr = 0;
- fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
- fl4.flowi4_scope = scope;
- fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
- fl4.flowi4_tun_key.tun_id = 0;
+ struct flowi4 fl4 = {
+ .flowi4_iif = LOOPBACK_IFINDEX,
+ .daddr = ip_hdr(skb)->saddr,
+ .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
+ .flowi4_scope = scope,
+ .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
+ };
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {
@@ -922,6 +920,9 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
subnet = 1;
}
+ if (in_dev->dead)
+ goto no_promotions;
+
/* Deletion is more complicated than add.
* We should take care of not to delete too much :-)
*
@@ -997,6 +998,7 @@ void fib_del_ifaddr(struct in_ifaddr *ifa, struct in_ifaddr *iprim)
}
}
+no_promotions:
if (!(ok & BRD_OK))
fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim);
if (subnet && ifa->ifa_prefixlen < 31) {
@@ -1155,6 +1157,7 @@ static int fib_inetaddr_event(struct notifier_block *this, unsigned long event,
static int fib_netdev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
+ struct netdev_notifier_changeupper_info *info;
struct in_device *in_dev;
struct net *net = dev_net(dev);
unsigned int flags;
@@ -1193,6 +1196,14 @@ static int fib_netdev_event(struct notifier_block *this, unsigned long event, vo
case NETDEV_CHANGEMTU:
rt_cache_flush(net);
break;
+ case NETDEV_CHANGEUPPER:
+ info = ptr;
+ /* flush all routes if dev is linked to or unlinked from
+ * an L3 master device (e.g., VRF)
+ */
+ if (info->upper_dev && netif_is_l3_master(info->upper_dev))
+ fib_disable_ip(dev, NETDEV_DOWN, true);
+ break;
}
return NOTIFY_DONE;
}
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 3e87447e65c7..d97268e8ff10 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -923,14 +923,21 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
if (cfg->fc_type != RTN_LOCAL || !cfg->fc_dst ||
fib_prefsrc != cfg->fc_dst) {
u32 tb_id = cfg->fc_table;
+ int rc;
if (tb_id == RT_TABLE_MAIN)
tb_id = RT_TABLE_LOCAL;
- if (inet_addr_type_table(cfg->fc_nlinfo.nl_net,
- fib_prefsrc, tb_id) != RTN_LOCAL) {
- return false;
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, tb_id);
+
+ if (rc != RTN_LOCAL && tb_id != RT_TABLE_LOCAL) {
+ rc = inet_addr_type_table(cfg->fc_nlinfo.nl_net,
+ fib_prefsrc, RT_TABLE_LOCAL);
}
+
+ if (rc != RTN_LOCAL)
+ return false;
}
return true;
}
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 744e5936c10d..d07fc076bea0 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head)
if (!n->tn_bits)
kmem_cache_free(trie_leaf_kmem, n);
- else if (n->tn_bits <= TNODE_KMALLOC_MAX)
- kfree(n);
else
- vfree(n);
+ kvfree(n);
}
#define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu)
@@ -1396,9 +1394,10 @@ found:
struct fib_info *fi = fa->fa_info;
int nhsel, err;
- if ((index >= (1ul << fa->fa_slen)) &&
- ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen != KEYLENGTH)))
- continue;
+ if ((BITS_PER_LONG > KEYLENGTH) || (fa->fa_slen < KEYLENGTH)) {
+ if (index >= (1ul << fa->fa_slen))
+ continue;
+ }
if (fa->fa_tos && fa->fa_tos != flp->flowi4_tos)
continue;
if (fi->fib_dead)
diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index e0fcbbbcfe54..a39068b4a4d9 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -24,6 +24,7 @@ struct fou {
u16 type;
struct udp_offload udp_offloads;
struct list_head list;
+ struct rcu_head rcu;
};
#define FOU_F_REMCSUM_NOPARTIAL BIT(0)
@@ -47,7 +48,7 @@ static inline struct fou *fou_from_sock(struct sock *sk)
return sk->sk_user_data;
}
-static void fou_recv_pull(struct sk_buff *skb, size_t len)
+static int fou_recv_pull(struct sk_buff *skb, size_t len)
{
struct iphdr *iph = ip_hdr(skb);
@@ -58,6 +59,7 @@ static void fou_recv_pull(struct sk_buff *skb, size_t len)
__skb_pull(skb, len);
skb_postpull_rcsum(skb, udp_hdr(skb), len);
skb_reset_transport_header(skb);
+ return iptunnel_pull_offloads(skb);
}
static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
@@ -67,9 +69,14 @@ static int fou_udp_recv(struct sock *sk, struct sk_buff *skb)
if (!fou)
return 1;
- fou_recv_pull(skb, sizeof(struct udphdr));
+ if (fou_recv_pull(skb, sizeof(struct udphdr)))
+ goto drop;
return -fou->protocol;
+
+drop:
+ kfree_skb(skb);
+ return 0;
}
static struct guehdr *gue_remcsum(struct sk_buff *skb, struct guehdr *guehdr,
@@ -169,6 +176,9 @@ static int gue_udp_recv(struct sock *sk, struct sk_buff *skb)
__skb_pull(skb, sizeof(struct udphdr) + hdrlen);
skb_reset_transport_header(skb);
+ if (iptunnel_pull_offloads(skb))
+ goto drop;
+
return -guehdr->proto_ctype;
drop:
@@ -185,6 +195,17 @@ static struct sk_buff **fou_gro_receive(struct sk_buff **head,
u8 proto = NAPI_GRO_CB(skb)->proto;
const struct net_offload **offloads;
+ /* We can clear the encap_mark for FOU as we are essentially doing
+ * one of two possible things. We are either adding an L4 tunnel
+ * header to the outer L3 tunnel header, or we are are simply
+ * treating the GRE tunnel header as though it is a UDP protocol
+ * specific header such as VXLAN or GENEVE.
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
+ /* Flag this frame as already having an outer encap header */
+ NAPI_GRO_CB(skb)->is_fou = 1;
+
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[proto]);
@@ -318,8 +339,6 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
skb_gro_pull(skb, hdrlen);
- flush = 0;
-
for (p = *head; p; p = p->next) {
const struct guehdr *guehdr2;
@@ -344,6 +363,17 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
}
}
+ /* We can clear the encap_mark for GUE as we are essentially doing
+ * one of two possible things. We are either adding an L4 tunnel
+ * header to the outer L3 tunnel header, or we are are simply
+ * treating the GRE tunnel header as though it is a UDP protocol
+ * specific header such as VXLAN or GENEVE.
+ */
+ NAPI_GRO_CB(skb)->encap_mark = 0;
+
+ /* Flag this frame as already having an outer encap header */
+ NAPI_GRO_CB(skb)->is_fou = 1;
+
rcu_read_lock();
offloads = NAPI_GRO_CB(skb)->is_ipv6 ? inet6_offloads : inet_offloads;
ops = rcu_dereference(offloads[guehdr->proto_ctype]);
@@ -351,6 +381,7 @@ static struct sk_buff **gue_gro_receive(struct sk_buff **head,
goto out_unlock;
pp = ops->callbacks.gro_receive(head, skb);
+ flush = 0;
out_unlock:
rcu_read_unlock();
@@ -417,7 +448,7 @@ static void fou_release(struct fou *fou)
list_del(&fou->list);
udp_tunnel_sock_release(sock);
- kfree(fou);
+ kfree_rcu(fou, rcu);
}
static int fou_encap_init(struct sock *sk, struct fou *fou, struct fou_cfg *cfg)
@@ -497,7 +528,7 @@ static int fou_create(struct net *net, struct fou_cfg *cfg,
sk->sk_allocation = GFP_ATOMIC;
if (cfg->udp_config.family == AF_INET) {
- err = udp_add_offload(&fou->udp_offloads);
+ err = udp_add_offload(net, &fou->udp_offloads);
if (err)
goto error;
}
@@ -773,7 +804,6 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
uh->dest = e->dport;
uh->source = sport;
uh->len = htons(skb->len);
- uh->check = 0;
udp_set_csum(!(e->flags & TUNNEL_ENCAP_FLAG_CSUM), skb,
fl4->saddr, fl4->daddr, skb->len);
@@ -783,11 +813,11 @@ static void fou_build_udp(struct sk_buff *skb, struct ip_tunnel_encap *e,
int fou_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, struct flowi4 *fl4)
{
- bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
- int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
__be16 sport;
- skb = iptunnel_handle_offloads(skb, csum, type);
+ skb = iptunnel_handle_offloads(skb, type);
if (IS_ERR(skb))
return PTR_ERR(skb);
@@ -803,8 +833,8 @@ EXPORT_SYMBOL(fou_build_header);
int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
u8 *protocol, struct flowi4 *fl4)
{
- bool csum = !!(e->flags & TUNNEL_ENCAP_FLAG_CSUM);
- int type = csum ? SKB_GSO_UDP_TUNNEL_CSUM : SKB_GSO_UDP_TUNNEL;
+ int type = e->flags & TUNNEL_ENCAP_FLAG_CSUM ? SKB_GSO_UDP_TUNNEL_CSUM :
+ SKB_GSO_UDP_TUNNEL;
struct guehdr *guehdr;
size_t hdrlen, optlen = 0;
__be16 sport;
@@ -813,7 +843,6 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
if ((e->flags & TUNNEL_ENCAP_FLAG_REMCSUM) &&
skb->ip_summed == CHECKSUM_PARTIAL) {
- csum = false;
optlen += GUE_PLEN_REMCSUM;
type |= SKB_GSO_TUNNEL_REMCSUM;
need_priv = true;
@@ -821,7 +850,7 @@ int gue_build_header(struct sk_buff *skb, struct ip_tunnel_encap *e,
optlen += need_priv ? GUE_LEN_PRIV : 0;
- skb = iptunnel_handle_offloads(skb, csum, type);
+ skb = iptunnel_handle_offloads(skb, type);
if (IS_ERR(skb))
return PTR_ERR(skb);
diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index 5a8ee3282550..6a5bd4317866 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -18,15 +18,13 @@
static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
netdev_features_t features)
{
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
struct sk_buff *segs = ERR_PTR(-EINVAL);
- netdev_features_t enc_features;
- int ghl;
- struct gre_base_hdr *greh;
u16 mac_offset = skb->mac_header;
- int mac_len = skb->mac_len;
__be16 protocol = skb->protocol;
- int tnl_hlen;
- bool csum;
+ u16 mac_len = skb->mac_len;
+ int gre_offset, outer_hlen;
+ bool need_csum, ufo;
if (unlikely(skb_shinfo(skb)->gso_type &
~(SKB_GSO_TCPV4 |
@@ -43,74 +41,75 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
if (!skb->encapsulation)
goto out;
- if (unlikely(!pskb_may_pull(skb, sizeof(*greh))))
+ if (unlikely(tnl_hlen < sizeof(struct gre_base_hdr)))
goto out;
- greh = (struct gre_base_hdr *)skb_transport_header(skb);
-
- ghl = skb_inner_mac_header(skb) - skb_transport_header(skb);
- if (unlikely(ghl < sizeof(*greh)))
+ if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
goto out;
- csum = !!(greh->flags & GRE_CSUM);
- if (csum)
- skb->encap_hdr_csum = 1;
-
/* setup inner skb. */
- skb->protocol = greh->protocol;
skb->encapsulation = 0;
-
- if (unlikely(!pskb_may_pull(skb, ghl)))
- goto out;
-
- __skb_pull(skb, ghl);
+ SKB_GSO_CB(skb)->encap_level = 0;
+ __skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
skb->mac_len = skb_inner_network_offset(skb);
+ skb->protocol = skb->inner_protocol;
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_GRE_CSUM);
+ skb->encap_hdr_csum = need_csum;
+
+ ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+
+ features &= skb->dev->hw_enc_features;
+
+ /* The only checksum offload we care about from here on out is the
+ * outer one so strip the existing checksum feature flags based
+ * on the fact that we will be computing our checksum in software.
+ */
+ if (ufo) {
+ features &= ~NETIF_F_CSUM_MASK;
+ if (!need_csum)
+ features |= NETIF_F_HW_CSUM;
+ }
/* segment inner packet. */
- enc_features = skb->dev->hw_enc_features & features;
- segs = skb_mac_gso_segment(skb, enc_features);
+ segs = skb_mac_gso_segment(skb, features);
if (IS_ERR_OR_NULL(segs)) {
- skb_gso_error_unwind(skb, protocol, ghl, mac_offset, mac_len);
+ skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
+ mac_len);
goto out;
}
+ outer_hlen = skb_tnl_header_len(skb);
+ gre_offset = outer_hlen - tnl_hlen;
skb = segs;
- tnl_hlen = skb_tnl_header_len(skb);
do {
- __skb_push(skb, ghl);
- if (csum) {
- __be32 *pcsum;
-
- if (skb_has_shared_frag(skb)) {
- int err;
-
- err = __skb_linearize(skb);
- if (err) {
- kfree_skb_list(segs);
- segs = ERR_PTR(err);
- goto out;
- }
- }
+ struct gre_base_hdr *greh;
+ __be32 *pcsum;
- skb_reset_transport_header(skb);
-
- greh = (struct gre_base_hdr *)
- skb_transport_header(skb);
- pcsum = (__be32 *)(greh + 1);
- *pcsum = 0;
- *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
+ /* Set up inner headers if we are offloading inner checksum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ skb_reset_inner_headers(skb);
+ skb->encapsulation = 1;
}
- __skb_push(skb, tnl_hlen - ghl);
- skb_reset_inner_headers(skb);
- skb->encapsulation = 1;
+ skb->mac_len = mac_len;
+ skb->protocol = protocol;
+ __skb_push(skb, outer_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
- skb->mac_len = mac_len;
- skb->protocol = protocol;
+ skb_set_transport_header(skb, gre_offset);
+
+ if (!need_csum)
+ continue;
+
+ greh = (struct gre_base_hdr *)skb_transport_header(skb);
+ pcsum = (__be32 *)(greh + 1);
+
+ *pcsum = 0;
+ *(__sum16 *)pcsum = gso_make_checksum(skb, 0);
} while ((skb = skb->next));
out:
return segs;
@@ -128,6 +127,11 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
struct packet_offload *ptype;
__be16 type;
+ if (NAPI_GRO_CB(skb)->encap_mark)
+ goto out;
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
off = skb_gro_offset(skb);
hlen = off + sizeof(*greh);
greh = skb_gro_header_fast(skb, off);
@@ -146,6 +150,14 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
if ((greh->flags & ~(GRE_KEY|GRE_CSUM)) != 0)
goto out;
+ /* We can only support GRE_CSUM if we can track the location of
+ * the GRE header. In the case of FOU/GUE we cannot because the
+ * outer UDP header displaces the GRE header leaving us in a state
+ * of limbo.
+ */
+ if ((greh->flags & GRE_CSUM) && NAPI_GRO_CB(skb)->is_fou)
+ goto out;
+
type = greh->protocol;
rcu_read_lock();
@@ -177,8 +189,6 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
null_compute_pseudo);
}
- flush = 0;
-
for (p = *head; p; p = p->next) {
const struct gre_base_hdr *greh2;
@@ -215,6 +225,7 @@ static struct sk_buff **gre_gro_receive(struct sk_buff **head,
skb_gro_postpull_rcsum(skb, greh, grehlen);
pp = ptype->callbacks.gro_receive(head, skb);
+ flush = 0;
out_unlock:
rcu_read_unlock();
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index 36e26977c908..6333489771ed 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -931,7 +931,6 @@ static bool icmp_echo(struct sk_buff *skb)
*/
static bool icmp_timestamp(struct sk_buff *skb)
{
- struct timespec tv;
struct icmp_bxm icmp_param;
/*
* Too short.
@@ -942,9 +941,7 @@ static bool icmp_timestamp(struct sk_buff *skb)
/*
* Fill in the current time as ms since midnight UT:
*/
- getnstimeofday(&tv);
- icmp_param.data.times[1] = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC +
- tv.tv_nsec / NSEC_PER_MSEC);
+ icmp_param.data.times[1] = inet_current_timestamp();
icmp_param.data.times[2] = icmp_param.data.times[1];
if (skb_copy_bits(skb, 0, &icmp_param.data.times[0], 4))
BUG();
diff --git a/net/ipv4/igmp.c b/net/ipv4/igmp.c
index 64aaf3522a59..9b4ca87f70ba 100644
--- a/net/ipv4/igmp.c
+++ b/net/ipv4/igmp.c
@@ -107,12 +107,6 @@
#include <linux/seq_file.h>
#endif
-#define IP_MAX_MEMBERSHIPS 20
-#define IP_MAX_MSF 10
-
-/* IGMP reports for link-local multicast groups are enabled by default */
-int sysctl_igmp_llm_reports __read_mostly = 1;
-
#ifdef CONFIG_IP_MULTICAST
/* Parameter names and values are taken from igmp-v2-06 draft */
@@ -356,9 +350,8 @@ static struct sk_buff *igmpv3_newpack(struct net_device *dev, unsigned int mtu)
skb_dst_set(skb, &rt->dst);
skb->dev = dev;
- skb->reserved_tailroom = skb_end_offset(skb) -
- min(mtu, skb_end_offset(skb));
skb_reserve(skb, hlen);
+ skb_tailroom_reserve(skb, mtu, tlen);
skb_reset_network_header(skb);
pip = ip_hdr(skb);
@@ -433,6 +426,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
int type, int gdeleted, int sdeleted)
{
struct net_device *dev = pmc->interface->dev;
+ struct net *net = dev_net(dev);
struct igmpv3_report *pih;
struct igmpv3_grec *pgr = NULL;
struct ip_sf_list *psf, *psf_next, *psf_prev, **psf_list;
@@ -440,7 +434,7 @@ static struct sk_buff *add_grec(struct sk_buff *skb, struct ip_mc_list *pmc,
if (pmc->multiaddr == IGMP_ALL_HOSTS)
return skb;
- if (ipv4_is_local_multicast(pmc->multiaddr) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(pmc->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
return skb;
isquery = type == IGMPV3_MODE_IS_INCLUDE ||
@@ -543,6 +537,7 @@ empty_source:
static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
{
struct sk_buff *skb = NULL;
+ struct net *net = dev_net(in_dev->dev);
int type;
if (!pmc) {
@@ -551,7 +546,7 @@ static int igmpv3_send_report(struct in_device *in_dev, struct ip_mc_list *pmc)
if (pmc->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(pmc->multiaddr) &&
- !sysctl_igmp_llm_reports)
+ !net->ipv4.sysctl_igmp_llm_reports)
continue;
spin_lock_bh(&pmc->lock);
if (pmc->sfcount[MCAST_EXCLUDE])
@@ -687,7 +682,7 @@ static int igmp_send_report(struct in_device *in_dev, struct ip_mc_list *pmc,
if (type == IGMPV3_HOST_MEMBERSHIP_REPORT)
return igmpv3_send_report(in_dev, pmc);
- if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
return 0;
if (type == IGMP_HOST_LEAVE_MESSAGE)
@@ -766,9 +761,10 @@ static void igmp_ifc_timer_expire(unsigned long data)
static void igmp_ifc_event(struct in_device *in_dev)
{
+ struct net *net = dev_net(in_dev->dev);
if (IGMP_V1_SEEN(in_dev) || IGMP_V2_SEEN(in_dev))
return;
- in_dev->mr_ifc_count = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ in_dev->mr_ifc_count = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
igmp_ifc_start_timer(in_dev, 1);
}
@@ -858,12 +854,13 @@ static int igmp_marksources(struct ip_mc_list *pmc, int nsrcs, __be32 *srcs)
static bool igmp_heard_report(struct in_device *in_dev, __be32 group)
{
struct ip_mc_list *im;
+ struct net *net = dev_net(in_dev->dev);
/* Timers are only set for non-local groups */
if (group == IGMP_ALL_HOSTS)
return false;
- if (ipv4_is_local_multicast(group) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(group) && !net->ipv4.sysctl_igmp_llm_reports)
return false;
rcu_read_lock();
@@ -887,6 +884,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
__be32 group = ih->group;
int max_delay;
int mark = 0;
+ struct net *net = dev_net(in_dev->dev);
if (len == 8) {
@@ -972,7 +970,7 @@ static bool igmp_heard_query(struct in_device *in_dev, struct sk_buff *skb,
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(im->multiaddr) &&
- !sysctl_igmp_llm_reports)
+ !net->ipv4.sysctl_igmp_llm_reports)
continue;
spin_lock_bh(&im->lock);
if (im->tm_running)
@@ -1088,6 +1086,7 @@ static void ip_mc_filter_del(struct in_device *in_dev, __be32 addr)
static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
{
struct ip_mc_list *pmc;
+ struct net *net = dev_net(in_dev->dev);
/* this is an "ip_mc_list" for convenience; only the fields below
* are actually used. In particular, the refcnt and users are not
@@ -1102,7 +1101,7 @@ static void igmpv3_add_delrec(struct in_device *in_dev, struct ip_mc_list *im)
pmc->interface = im->interface;
in_dev_hold(in_dev);
pmc->multiaddr = im->multiaddr;
- pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
pmc->sfmode = im->sfmode;
if (pmc->sfmode == MCAST_INCLUDE) {
struct ip_sf_list *psf;
@@ -1187,6 +1186,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
int reporter;
#endif
@@ -1198,7 +1198,7 @@ static void igmp_group_dropped(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
return;
reporter = im->reporter;
@@ -1223,6 +1223,9 @@ static void igmp_group_dropped(struct ip_mc_list *im)
static void igmp_group_added(struct ip_mc_list *im)
{
struct in_device *in_dev = im->interface;
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
if (im->loaded == 0) {
im->loaded = 1;
@@ -1232,7 +1235,7 @@ static void igmp_group_added(struct ip_mc_list *im)
#ifdef CONFIG_IP_MULTICAST
if (im->multiaddr == IGMP_ALL_HOSTS)
return;
- if (ipv4_is_local_multicast(im->multiaddr) && !sysctl_igmp_llm_reports)
+ if (ipv4_is_local_multicast(im->multiaddr) && !net->ipv4.sysctl_igmp_llm_reports)
return;
if (in_dev->dead)
@@ -1245,7 +1248,7 @@ static void igmp_group_added(struct ip_mc_list *im)
}
/* else, v3 */
- im->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ im->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
igmp_ifc_event(in_dev);
#endif
}
@@ -1314,6 +1317,9 @@ static void ip_mc_hash_remove(struct in_device *in_dev,
void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
{
struct ip_mc_list *im;
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
ASSERT_RTNL();
@@ -1340,7 +1346,7 @@ void ip_mc_inc_group(struct in_device *in_dev, __be32 addr)
spin_lock_init(&im->lock);
#ifdef CONFIG_IP_MULTICAST
setup_timer(&im->timer, igmp_timer_expire, (unsigned long)im);
- im->unsolicit_count = sysctl_igmp_qrv;
+ im->unsolicit_count = net->ipv4.sysctl_igmp_qrv;
#endif
im->next_rcu = in_dev->mc_list;
@@ -1533,6 +1539,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
#ifdef CONFIG_IP_MULTICAST
struct ip_mc_list *im;
int type;
+ struct net *net = dev_net(in_dev->dev);
ASSERT_RTNL();
@@ -1540,7 +1547,7 @@ static void ip_mc_rejoin_groups(struct in_device *in_dev)
if (im->multiaddr == IGMP_ALL_HOSTS)
continue;
if (ipv4_is_local_multicast(im->multiaddr) &&
- !sysctl_igmp_llm_reports)
+ !net->ipv4.sysctl_igmp_llm_reports)
continue;
/* a failover is happening and switches
@@ -1639,6 +1646,9 @@ void ip_mc_down(struct in_device *in_dev)
void ip_mc_init_dev(struct in_device *in_dev)
{
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTICAST
@@ -1646,7 +1656,7 @@ void ip_mc_init_dev(struct in_device *in_dev)
(unsigned long)in_dev);
setup_timer(&in_dev->mr_ifc_timer, igmp_ifc_timer_expire,
(unsigned long)in_dev);
- in_dev->mr_qrv = sysctl_igmp_qrv;
+ in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
#endif
spin_lock_init(&in_dev->mc_tomb_lock);
@@ -1657,11 +1667,14 @@ void ip_mc_init_dev(struct in_device *in_dev)
void ip_mc_up(struct in_device *in_dev)
{
struct ip_mc_list *pmc;
+#ifdef CONFIG_IP_MULTICAST
+ struct net *net = dev_net(in_dev->dev);
+#endif
ASSERT_RTNL();
#ifdef CONFIG_IP_MULTICAST
- in_dev->mr_qrv = sysctl_igmp_qrv;
+ in_dev->mr_qrv = net->ipv4.sysctl_igmp_qrv;
#endif
ip_mc_inc_group(in_dev, IGMP_ALL_HOSTS);
@@ -1727,11 +1740,6 @@ static struct in_device *ip_mc_find_dev(struct net *net, struct ip_mreqn *imr)
/*
* Join a socket to a group
*/
-int sysctl_igmp_max_memberships __read_mostly = IP_MAX_MEMBERSHIPS;
-int sysctl_igmp_max_msf __read_mostly = IP_MAX_MSF;
-#ifdef CONFIG_IP_MULTICAST
-int sysctl_igmp_qrv __read_mostly = IGMP_QUERY_ROBUSTNESS_VARIABLE;
-#endif
static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
__be32 *psfsrc)
@@ -1756,6 +1764,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
if (!psf->sf_count[MCAST_INCLUDE] && !psf->sf_count[MCAST_EXCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct in_device *in_dev = pmc->interface;
+ struct net *net = dev_net(in_dev->dev);
#endif
/* no more filters for this source */
@@ -1766,7 +1775,7 @@ static int ip_mc_del1_src(struct ip_mc_list *pmc, int sfmode,
#ifdef CONFIG_IP_MULTICAST
if (psf->sf_oldin &&
!IGMP_V1_SEEN(in_dev) && !IGMP_V2_SEEN(in_dev)) {
- psf->sf_crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ psf->sf_crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
psf->sf_next = pmc->tomb;
pmc->tomb = psf;
rv = 1;
@@ -1824,12 +1833,13 @@ static int ip_mc_del_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
pmc->sfcount[MCAST_INCLUDE]) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
+ struct net *net = dev_net(in_dev->dev);
#endif
/* filter mode change */
pmc->sfmode = MCAST_INCLUDE;
#ifdef CONFIG_IP_MULTICAST
- pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
in_dev->mr_ifc_count = pmc->crcount;
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
@@ -1996,6 +2006,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
} else if (isexclude != (pmc->sfcount[MCAST_EXCLUDE] != 0)) {
#ifdef CONFIG_IP_MULTICAST
struct ip_sf_list *psf;
+ struct net *net = dev_net(pmc->interface->dev);
in_dev = pmc->interface;
#endif
@@ -2007,7 +2018,7 @@ static int ip_mc_add_src(struct in_device *in_dev, __be32 *pmca, int sfmode,
#ifdef CONFIG_IP_MULTICAST
/* else no filters; keep old mode for reports */
- pmc->crcount = in_dev->mr_qrv ?: sysctl_igmp_qrv;
+ pmc->crcount = in_dev->mr_qrv ?: net->ipv4.sysctl_igmp_qrv;
in_dev->mr_ifc_count = pmc->crcount;
for (psf = pmc->sources; psf; psf = psf->sf_next)
psf->sf_crcount = 0;
@@ -2074,7 +2085,7 @@ int ip_mc_join_group(struct sock *sk, struct ip_mreqn *imr)
count++;
}
err = -ENOBUFS;
- if (count >= sysctl_igmp_max_memberships)
+ if (count >= net->ipv4.sysctl_igmp_max_memberships)
goto done;
iml = sock_kmalloc(sk, sizeof(*iml), GFP_KERNEL);
if (!iml)
@@ -2126,7 +2137,7 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
ASSERT_RTNL();
in_dev = ip_mc_find_dev(net, imr);
- if (!in_dev) {
+ if (!imr->imr_ifindex && !imr->imr_address.s_addr && !in_dev) {
ret = -ENODEV;
goto out;
}
@@ -2147,7 +2158,8 @@ int ip_mc_leave_group(struct sock *sk, struct ip_mreqn *imr)
*imlp = iml->next_rcu;
- ip_mc_dec_group(in_dev, group);
+ if (in_dev)
+ ip_mc_dec_group(in_dev, group);
/* decrease mem now to avoid the memleak warning */
atomic_sub(sizeof(*iml), &sk->sk_omem_alloc);
@@ -2245,7 +2257,7 @@ int ip_mc_source(int add, int omode, struct sock *sk, struct
}
/* else, add a new source to the filter */
- if (psl && psl->sl_count >= sysctl_igmp_max_msf) {
+ if (psl && psl->sl_count >= net->ipv4.sysctl_igmp_max_msf) {
err = -ENOBUFS;
goto done;
}
@@ -2392,11 +2404,11 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
struct ip_sf_socklist *psl;
struct net *net = sock_net(sk);
+ ASSERT_RTNL();
+
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
imr.imr_multiaddr.s_addr = msf->imsf_multiaddr;
imr.imr_address.s_addr = msf->imsf_interface;
imr.imr_ifindex = 0;
@@ -2417,7 +2429,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
goto done;
msf->imsf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
- rtnl_unlock();
if (!psl) {
len = 0;
count = 0;
@@ -2436,7 +2447,6 @@ int ip_mc_msfget(struct sock *sk, struct ip_msfilter *msf,
return -EFAULT;
return 0;
done:
- rtnl_unlock();
return err;
}
@@ -2450,6 +2460,8 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
struct inet_sock *inet = inet_sk(sk);
struct ip_sf_socklist *psl;
+ ASSERT_RTNL();
+
psin = (struct sockaddr_in *)&gsf->gf_group;
if (psin->sin_family != AF_INET)
return -EINVAL;
@@ -2457,8 +2469,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
if (!ipv4_is_multicast(addr))
return -EINVAL;
- rtnl_lock();
-
err = -EADDRNOTAVAIL;
for_each_pmc_rtnl(inet, pmc) {
@@ -2470,7 +2480,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
goto done;
gsf->gf_fmode = pmc->sfmode;
psl = rtnl_dereference(pmc->sflist);
- rtnl_unlock();
count = psl ? psl->sl_count : 0;
copycount = count < gsf->gf_numsrc ? count : gsf->gf_numsrc;
gsf->gf_numsrc = count;
@@ -2490,7 +2499,6 @@ int ip_mc_gsfget(struct sock *sk, struct group_filter *gsf,
}
return 0;
done:
- rtnl_unlock();
return err;
}
@@ -2922,6 +2930,12 @@ static int __net_init igmp_net_init(struct net *net)
goto out_sock;
}
+ /* Sysctl initialization */
+ net->ipv4.sysctl_igmp_max_memberships = 20;
+ net->ipv4.sysctl_igmp_max_msf = 10;
+ /* IGMP reports for link-local multicast groups are enabled by default */
+ net->ipv4.sysctl_igmp_llm_reports = 1;
+ net->ipv4.sysctl_igmp_qrv = 2;
return 0;
out_sock:
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 1feb15f23de8..bc5196ea1bdf 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,6 +24,7 @@
#include <net/tcp_states.h>
#include <net/xfrm.h>
#include <net/tcp.h>
+#include <net/sock_reuseport.h>
#ifdef INET_CSK_DEBUG
const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) &&
(!reuseport || !sk2->sk_reuseport ||
- (sk2->sk_state != TCP_TIME_WAIT &&
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
+ (sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid, sock_i_uid(sk2))))) {
if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
@@ -89,161 +91,154 @@ EXPORT_SYMBOL_GPL(inet_csk_bind_conflict);
/* Obtain a reference to a local port for the given sock,
* if snum is zero it means select any available local port.
+ * We try to allocate an odd port (and leave even ports for connect())
*/
int inet_csk_get_port(struct sock *sk, unsigned short snum)
{
- struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
+ bool reuse = sk->sk_reuse && sk->sk_state != TCP_LISTEN;
+ struct inet_hashinfo *hinfo = sk->sk_prot->h.hashinfo;
+ int ret = 1, attempts = 5, port = snum;
+ int smallest_size = -1, smallest_port;
struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret, attempts = 5;
struct net *net = sock_net(sk);
- int smallest_size = -1, smallest_rover;
+ int i, low, high, attempt_half;
+ struct inet_bind_bucket *tb;
kuid_t uid = sock_i_uid(sk);
- int attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+ u32 remaining, offset;
- local_bh_disable();
- if (!snum) {
- int remaining, rover, low, high;
+ if (port) {
+have_port:
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == port)
+ goto tb_found;
+ goto tb_not_found;
+ }
again:
- inet_get_local_port_range(net, &low, &high);
- if (attempt_half) {
- int half = low + ((high - low) >> 1);
-
- if (attempt_half == 1)
- high = half;
- else
- low = half;
- }
- remaining = (high - low) + 1;
- smallest_rover = rover = prandom_u32() % remaining + low;
-
- smallest_size = -1;
- do {
- if (inet_is_local_reserved_port(net, rover))
- goto next_nolock;
- head = &hashinfo->bhash[inet_bhashfn(net, rover,
- hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == rover) {
- if (((tb->fastreuse > 0 &&
- sk->sk_reuse &&
- sk->sk_state != TCP_LISTEN) ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport &&
- uid_eq(tb->fastuid, uid))) &&
- (tb->num_owners < smallest_size || smallest_size == -1)) {
- smallest_size = tb->num_owners;
- smallest_rover = rover;
- }
- if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false)) {
- snum = rover;
- goto tb_found;
- }
- goto next;
+ attempt_half = (sk->sk_reuse == SK_CAN_REUSE) ? 1 : 0;
+other_half_scan:
+ inet_get_local_port_range(net, &low, &high);
+ high++; /* [32768, 60999] -> [32768, 61000[ */
+ if (high - low < 4)
+ attempt_half = 0;
+ if (attempt_half) {
+ int half = low + (((high - low) >> 2) << 1);
+
+ if (attempt_half == 1)
+ high = half;
+ else
+ low = half;
+ }
+ remaining = high - low;
+ if (likely(remaining > 1))
+ remaining &= ~1U;
+
+ offset = prandom_u32() % remaining;
+ /* __inet_hash_connect() favors ports having @low parity
+ * We do the opposite to not pollute connect() users.
+ */
+ offset |= 1U;
+ smallest_size = -1;
+ smallest_port = low; /* avoid compiler warning */
+
+other_parity_scan:
+ port = low + offset;
+ for (i = 0; i < remaining; i += 2, port += 2) {
+ if (unlikely(port >= high))
+ port -= remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
+ inet_bind_bucket_for_each(tb, &head->chain)
+ if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (((tb->fastreuse > 0 && reuse) ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(tb->fastuid, uid))) &&
+ (tb->num_owners < smallest_size || smallest_size == -1)) {
+ smallest_size = tb->num_owners;
+ smallest_port = port;
}
- break;
- next:
- spin_unlock(&head->lock);
- next_nolock:
- if (++rover > high)
- rover = low;
- } while (--remaining > 0);
-
- /* Exhausted local port range during search? It is not
- * possible for us to be holding one of the bind hash
- * locks if this test triggers, because if 'remaining'
- * drops to zero, we broke out of the do/while loop at
- * the top level, not from the 'break;' statement.
- */
- ret = 1;
- if (remaining <= 0) {
- if (smallest_size != -1) {
- snum = smallest_rover;
- goto have_snum;
- }
- if (attempt_half == 1) {
- /* OK we now try the upper half of the range */
- attempt_half = 2;
- goto again;
+ if (!inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, false))
+ goto tb_found;
+ goto next_port;
}
- goto fail;
- }
- /* OK, here is the one we will use. HEAD is
- * non-NULL and we hold it's mutex.
- */
- snum = rover;
- } else {
-have_snum:
- head = &hashinfo->bhash[inet_bhashfn(net, snum,
- hashinfo->bhash_size)];
- spin_lock(&head->lock);
- inet_bind_bucket_for_each(tb, &head->chain)
- if (net_eq(ib_net(tb), net) && tb->port == snum)
- goto tb_found;
+ goto tb_not_found;
+next_port:
+ spin_unlock_bh(&head->lock);
+ cond_resched();
+ }
+
+ if (smallest_size != -1) {
+ port = smallest_port;
+ goto have_port;
+ }
+ offset--;
+ if (!(offset & 1))
+ goto other_parity_scan;
+
+ if (attempt_half == 1) {
+ /* OK we now try the upper half of the range */
+ attempt_half = 2;
+ goto other_half_scan;
}
- tb = NULL;
- goto tb_not_found;
+ return ret;
+
+tb_not_found:
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
+ if (!tb)
+ goto fail_unlock;
tb_found:
if (!hlist_empty(&tb->owners)) {
if (sk->sk_reuse == SK_FORCE_REUSE)
goto success;
- if (((tb->fastreuse > 0 &&
- sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
+ if (((tb->fastreuse > 0 && reuse) ||
(tb->fastreuseport > 0 &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
- smallest_size == -1) {
+ smallest_size == -1)
goto success;
- } else {
- ret = 1;
- if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
- if (((sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
- (tb->fastreuseport > 0 &&
- sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
- smallest_size != -1 && --attempts >= 0) {
- spin_unlock(&head->lock);
- goto again;
- }
-
- goto fail_unlock;
+ if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, true)) {
+ if ((reuse ||
+ (tb->fastreuseport > 0 &&
+ sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(tb->fastuid, uid))) &&
+ smallest_size != -1 && --attempts >= 0) {
+ spin_unlock_bh(&head->lock);
+ goto again;
}
+ goto fail_unlock;
}
- }
-tb_not_found:
- ret = 1;
- if (!tb && (tb = inet_bind_bucket_create(hashinfo->bind_bucket_cachep,
- net, head, snum)) == NULL)
- goto fail_unlock;
- if (hlist_empty(&tb->owners)) {
- if (sk->sk_reuse && sk->sk_state != TCP_LISTEN)
- tb->fastreuse = 1;
- else
+ if (!reuse)
tb->fastreuse = 0;
+ if (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid))
+ tb->fastreuseport = 0;
+ } else {
+ tb->fastreuse = reuse;
if (sk->sk_reuseport) {
tb->fastreuseport = 1;
tb->fastuid = uid;
- } else
- tb->fastreuseport = 0;
- } else {
- if (tb->fastreuse &&
- (!sk->sk_reuse || sk->sk_state == TCP_LISTEN))
- tb->fastreuse = 0;
- if (tb->fastreuseport &&
- (!sk->sk_reuseport || !uid_eq(tb->fastuid, uid)))
+ } else {
tb->fastreuseport = 0;
+ }
}
success:
if (!inet_csk(sk)->icsk_bind_hash)
- inet_bind_hash(sk, tb, snum);
+ inet_bind_hash(sk, tb, port);
WARN_ON(inet_csk(sk)->icsk_bind_hash != tb);
ret = 0;
fail_unlock:
- spin_unlock(&head->lock);
-fail:
- local_bh_enable();
+ spin_unlock_bh(&head->lock);
return ret;
}
EXPORT_SYMBOL_GPL(inet_csk_get_port);
@@ -482,10 +477,6 @@ EXPORT_SYMBOL_GPL(inet_csk_route_child_sock);
#define AF_INET_FAMILY(fam) true
#endif
-/* Only thing we need from tcp.h */
-extern int sysctl_tcp_synack_retries;
-
-
/* Decide when to expire the request and when to resend SYN-ACK */
static inline void syn_ack_recalc(struct request_sock *req, const int thresh,
const int max_retries,
@@ -557,16 +548,17 @@ static void reqsk_timer_handler(unsigned long data)
{
struct request_sock *req = (struct request_sock *)data;
struct sock *sk_listener = req->rsk_listener;
+ struct net *net = sock_net(sk_listener);
struct inet_connection_sock *icsk = inet_csk(sk_listener);
struct request_sock_queue *queue = &icsk->icsk_accept_queue;
int qlen, expire = 0, resend = 0;
int max_retries, thresh;
u8 defer_accept;
- if (sk_listener->sk_state != TCP_LISTEN)
+ if (sk_state_load(sk_listener) != TCP_LISTEN)
goto drop;
- max_retries = icsk->icsk_syn_retries ? : sysctl_tcp_synack_retries;
+ max_retries = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_synack_retries;
thresh = max_retries;
/* Normally all the openreqs are young and become mature
* (i.e. converted to established socket) for first timeout.
@@ -737,6 +729,7 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct inet_sock *inet = inet_sk(sk);
+ int err = -EADDRINUSE;
reqsk_queue_alloc(&icsk->icsk_accept_queue);
@@ -749,18 +742,19 @@ int inet_csk_listen_start(struct sock *sk, int backlog)
* It is OK, because this socket enters to hash table only
* after validation is complete.
*/
- sk->sk_state = TCP_LISTEN;
+ sk_state_store(sk, TCP_LISTEN);
if (!sk->sk_prot->get_port(sk, inet->inet_num)) {
inet->inet_sport = htons(inet->inet_num);
sk_dst_reset(sk);
- sk->sk_prot->hash(sk);
+ err = sk->sk_prot->hash(sk);
- return 0;
+ if (likely(!err))
+ return 0;
}
sk->sk_state = TCP_CLOSE;
- return -EADDRINUSE;
+ return err;
}
EXPORT_SYMBOL_GPL(inet_csk_listen_start);
@@ -789,14 +783,16 @@ static void inet_child_forget(struct sock *sk, struct request_sock *req,
reqsk_put(req);
}
-void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
- struct sock *child)
+struct sock *inet_csk_reqsk_queue_add(struct sock *sk,
+ struct request_sock *req,
+ struct sock *child)
{
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
spin_lock(&queue->rskq_lock);
if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_child_forget(sk, req, child);
+ child = NULL;
} else {
req->sk = child;
req->dl_next = NULL;
@@ -808,6 +804,7 @@ void inet_csk_reqsk_queue_add(struct sock *sk, struct request_sock *req,
sk_acceptq_added(sk);
}
spin_unlock(&queue->rskq_lock);
+ return child;
}
EXPORT_SYMBOL(inet_csk_reqsk_queue_add);
@@ -817,11 +814,8 @@ struct sock *inet_csk_complete_hashdance(struct sock *sk, struct sock *child,
if (own_req) {
inet_csk_reqsk_queue_drop(sk, req);
reqsk_queue_removed(&inet_csk(sk)->icsk_accept_queue, req);
- inet_csk_reqsk_queue_add(sk, req, child);
- /* Warning: caller must not call reqsk_put(req);
- * child stole last reference on it.
- */
- return child;
+ if (inet_csk_reqsk_queue_add(sk, req, child))
+ return child;
}
/* Too bad, another child took ownership of the request, undo. */
bh_unlock_sock(child);
diff --git a/net/ipv4/inet_diag.c b/net/ipv4/inet_diag.c
index ab9f8a66615d..5fdb02f5598e 100644
--- a/net/ipv4/inet_diag.c
+++ b/net/ipv4/inet_diag.c
@@ -350,40 +350,60 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
nlmsg_flags, unlh);
}
-int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
- struct sk_buff *in_skb,
- const struct nlmsghdr *nlh,
- const struct inet_diag_req_v2 *req)
+struct sock *inet_diag_find_one_icsk(struct net *net,
+ struct inet_hashinfo *hashinfo,
+ const struct inet_diag_req_v2 *req)
{
- struct net *net = sock_net(in_skb->sk);
- struct sk_buff *rep;
struct sock *sk;
- int err;
- err = -EINVAL;
if (req->sdiag_family == AF_INET)
- sk = inet_lookup(net, hashinfo, req->id.idiag_dst[0],
+ sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[0],
req->id.idiag_dport, req->id.idiag_src[0],
req->id.idiag_sport, req->id.idiag_if);
#if IS_ENABLED(CONFIG_IPV6)
- else if (req->sdiag_family == AF_INET6)
- sk = inet6_lookup(net, hashinfo,
- (struct in6_addr *)req->id.idiag_dst,
- req->id.idiag_dport,
- (struct in6_addr *)req->id.idiag_src,
- req->id.idiag_sport,
- req->id.idiag_if);
+ else if (req->sdiag_family == AF_INET6) {
+ if (ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_dst) &&
+ ipv6_addr_v4mapped((struct in6_addr *)req->id.idiag_src))
+ sk = inet_lookup(net, hashinfo, NULL, 0, req->id.idiag_dst[3],
+ req->id.idiag_dport, req->id.idiag_src[3],
+ req->id.idiag_sport, req->id.idiag_if);
+ else
+ sk = inet6_lookup(net, hashinfo, NULL, 0,
+ (struct in6_addr *)req->id.idiag_dst,
+ req->id.idiag_dport,
+ (struct in6_addr *)req->id.idiag_src,
+ req->id.idiag_sport,
+ req->id.idiag_if);
+ }
#endif
else
- goto out_nosk;
+ return ERR_PTR(-EINVAL);
- err = -ENOENT;
if (!sk)
- goto out_nosk;
+ return ERR_PTR(-ENOENT);
- err = sock_diag_check_cookie(sk, req->id.idiag_cookie);
- if (err)
- goto out;
+ if (sock_diag_check_cookie(sk, req->id.idiag_cookie)) {
+ sock_gen_put(sk);
+ return ERR_PTR(-ENOENT);
+ }
+
+ return sk;
+}
+EXPORT_SYMBOL_GPL(inet_diag_find_one_icsk);
+
+int inet_diag_dump_one_icsk(struct inet_hashinfo *hashinfo,
+ struct sk_buff *in_skb,
+ const struct nlmsghdr *nlh,
+ const struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sk_buff *rep;
+ struct sock *sk;
+ int err;
+
+ sk = inet_diag_find_one_icsk(net, hashinfo, req);
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
rep = nlmsg_new(inet_sk_attr_size(), GFP_KERNEL);
if (!rep) {
@@ -409,12 +429,11 @@ out:
if (sk)
sock_gen_put(sk);
-out_nosk:
return err;
}
EXPORT_SYMBOL_GPL(inet_diag_dump_one_icsk);
-static int inet_diag_get_exact(struct sk_buff *in_skb,
+static int inet_diag_cmd_exact(int cmd, struct sk_buff *in_skb,
const struct nlmsghdr *nlh,
const struct inet_diag_req_v2 *req)
{
@@ -424,8 +443,12 @@ static int inet_diag_get_exact(struct sk_buff *in_skb,
handler = inet_diag_lock_handler(req->sdiag_protocol);
if (IS_ERR(handler))
err = PTR_ERR(handler);
- else
+ else if (cmd == SOCK_DIAG_BY_FAMILY)
err = handler->dump_one(in_skb, nlh, req);
+ else if (cmd == SOCK_DESTROY && handler->destroy)
+ err = handler->destroy(in_skb, req);
+ else
+ err = -EOPNOTSUPP;
inet_diag_unlock_handler(handler);
return err;
@@ -856,6 +879,7 @@ next_normal:
}
spin_unlock_bh(lock);
+ cond_resched();
}
done:
@@ -938,7 +962,7 @@ static int inet_diag_get_exact_compat(struct sk_buff *in_skb,
req.idiag_states = rc->idiag_states;
req.id = rc->id;
- return inet_diag_get_exact(in_skb, nlh, &req);
+ return inet_diag_cmd_exact(SOCK_DIAG_BY_FAMILY, in_skb, nlh, &req);
}
static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
@@ -972,7 +996,7 @@ static int inet_diag_rcv_msg_compat(struct sk_buff *skb, struct nlmsghdr *nlh)
return inet_diag_get_exact_compat(skb, nlh);
}
-static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
+static int inet_diag_handler_cmd(struct sk_buff *skb, struct nlmsghdr *h)
{
int hdrlen = sizeof(struct inet_diag_req_v2);
struct net *net = sock_net(skb->sk);
@@ -980,7 +1004,8 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
if (nlmsg_len(h) < hdrlen)
return -EINVAL;
- if (h->nlmsg_flags & NLM_F_DUMP) {
+ if (h->nlmsg_type == SOCK_DIAG_BY_FAMILY &&
+ h->nlmsg_flags & NLM_F_DUMP) {
if (nlmsg_attrlen(h, hdrlen)) {
struct nlattr *attr;
@@ -999,7 +1024,7 @@ static int inet_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
}
}
- return inet_diag_get_exact(skb, h, nlmsg_data(h));
+ return inet_diag_cmd_exact(h->nlmsg_type, skb, h, nlmsg_data(h));
}
static
@@ -1050,14 +1075,16 @@ int inet_diag_handler_get_info(struct sk_buff *skb, struct sock *sk)
static const struct sock_diag_handler inet_diag_handler = {
.family = AF_INET,
- .dump = inet_diag_handler_dump,
+ .dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
+ .destroy = inet_diag_handler_cmd,
};
static const struct sock_diag_handler inet6_diag_handler = {
.family = AF_INET6,
- .dump = inet_diag_handler_dump,
+ .dump = inet_diag_handler_cmd,
.get_info = inet_diag_handler_get_info,
+ .destroy = inet_diag_handler_cmd,
};
int inet_diag_register(const struct inet_diag_handler *h)
diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c
index fe144dae7372..3a88b0c73797 100644
--- a/net/ipv4/inet_fragment.c
+++ b/net/ipv4/inet_fragment.c
@@ -285,14 +285,6 @@ void inet_frag_kill(struct inet_frag_queue *fq, struct inet_frags *f)
}
EXPORT_SYMBOL(inet_frag_kill);
-static inline void frag_kfree_skb(struct netns_frags *nf, struct inet_frags *f,
- struct sk_buff *skb)
-{
- if (f->skb_free)
- f->skb_free(skb);
- kfree_skb(skb);
-}
-
void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
{
struct sk_buff *fp;
@@ -309,7 +301,7 @@ void inet_frag_destroy(struct inet_frag_queue *q, struct inet_frags *f)
struct sk_buff *xp = fp->next;
sum_truesize += fp->truesize;
- frag_kfree_skb(nf, f, fp);
+ kfree_skb(fp);
fp = xp;
}
sum = sum_truesize + f->qsize;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index ccc5980797fc..bc68eced0105 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -20,10 +20,12 @@
#include <linux/wait.h>
#include <linux/vmalloc.h>
+#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
+#include <net/sock_reuseport.h>
static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,
@@ -205,6 +207,7 @@ static inline int compute_score(struct sock *sk, struct net *net,
struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
const __be32 saddr, __be16 sport,
const __be32 daddr, const unsigned short hnum,
const int dif)
@@ -214,6 +217,7 @@ struct sock *__inet_lookup_listener(struct net *net,
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
int score, hiscore, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 phash = 0;
rcu_read_lock();
@@ -229,6 +233,15 @@ begin:
if (reuseport) {
phash = inet_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+ sk2 = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (sk2) {
+ result = sk2;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == hiscore && reuseport) {
@@ -246,11 +259,13 @@ begin:
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
+ select_ok = false;
goto begin;
}
}
@@ -449,32 +464,74 @@ bool inet_ehash_nolisten(struct sock *sk, struct sock *osk)
}
EXPORT_SYMBOL_GPL(inet_ehash_nolisten);
-void __inet_hash(struct sock *sk, struct sock *osk)
+static int inet_reuseport_add_sock(struct sock *sk,
+ struct inet_listen_hashbucket *ilb,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
+{
+ struct sock *sk2;
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+
+ sk_nulls_for_each_rcu(sk2, node, &ilb->head) {
+ if (sk2 != sk &&
+ sk2->sk_family == sk->sk_family &&
+ ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+ sk2->sk_bound_dev_if == sk->sk_bound_dev_if &&
+ sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ saddr_same(sk, sk2, false))
+ return reuseport_add_sock(sk, sk2);
+ }
+
+ /* Initial allocation may have already happened via setsockopt */
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return reuseport_alloc(sk);
+ return 0;
+}
+
+int __inet_hash(struct sock *sk, struct sock *osk,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
{
struct inet_hashinfo *hashinfo = sk->sk_prot->h.hashinfo;
struct inet_listen_hashbucket *ilb;
+ int err = 0;
if (sk->sk_state != TCP_LISTEN) {
inet_ehash_nolisten(sk, osk);
- return;
+ return 0;
}
WARN_ON(!sk_unhashed(sk));
ilb = &hashinfo->listening_hash[inet_sk_listen_hashfn(sk)];
spin_lock(&ilb->lock);
+ if (sk->sk_reuseport) {
+ err = inet_reuseport_add_sock(sk, ilb, saddr_same);
+ if (err)
+ goto unlock;
+ }
__sk_nulls_add_node_rcu(sk, &ilb->head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
+unlock:
spin_unlock(&ilb->lock);
+
+ return err;
}
EXPORT_SYMBOL(__inet_hash);
-void inet_hash(struct sock *sk)
+int inet_hash(struct sock *sk)
{
+ int err = 0;
+
if (sk->sk_state != TCP_CLOSE) {
local_bh_disable();
- __inet_hash(sk, NULL);
+ err = __inet_hash(sk, NULL, ipv4_rcv_saddr_equal);
local_bh_enable();
}
+
+ return err;
}
EXPORT_SYMBOL_GPL(inet_hash);
@@ -493,6 +550,8 @@ void inet_unhash(struct sock *sk)
lock = inet_ehash_lockp(hashinfo, sk->sk_hash);
spin_lock_bh(lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
done = __sk_nulls_del_node_init_rcu(sk);
if (done)
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, -1);
@@ -506,106 +565,106 @@ int __inet_hash_connect(struct inet_timewait_death_row *death_row,
struct sock *, __u16, struct inet_timewait_sock **))
{
struct inet_hashinfo *hinfo = death_row->hashinfo;
- const unsigned short snum = inet_sk(sk)->inet_num;
+ struct inet_timewait_sock *tw = NULL;
struct inet_bind_hashbucket *head;
- struct inet_bind_bucket *tb;
- int ret;
+ int port = inet_sk(sk)->inet_num;
struct net *net = sock_net(sk);
+ struct inet_bind_bucket *tb;
+ u32 remaining, offset;
+ int ret, i, low, high;
+ static u32 hint;
+
+ if (port) {
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ tb = inet_csk(sk)->icsk_bind_hash;
+ spin_lock_bh(&head->lock);
+ if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
+ inet_ehash_nolisten(sk, NULL);
+ spin_unlock_bh(&head->lock);
+ return 0;
+ }
+ spin_unlock(&head->lock);
+ /* No definite answer... Walk to established hash table */
+ ret = check_established(death_row, sk, port, NULL);
+ local_bh_enable();
+ return ret;
+ }
- if (!snum) {
- int i, remaining, low, high, port;
- static u32 hint;
- u32 offset = hint + port_offset;
- struct inet_timewait_sock *tw = NULL;
+ inet_get_local_port_range(net, &low, &high);
+ high++; /* [32768, 60999] -> [32768, 61000[ */
+ remaining = high - low;
+ if (likely(remaining > 1))
+ remaining &= ~1U;
- inet_get_local_port_range(net, &low, &high);
- remaining = (high - low) + 1;
+ offset = (hint + port_offset) % remaining;
+ /* In first pass we try ports of @low parity.
+ * inet_csk_get_port() does the opposite choice.
+ */
+ offset &= ~1U;
+other_parity_scan:
+ port = low + offset;
+ for (i = 0; i < remaining; i += 2, port += 2) {
+ if (unlikely(port >= high))
+ port -= remaining;
+ if (inet_is_local_reserved_port(net, port))
+ continue;
+ head = &hinfo->bhash[inet_bhashfn(net, port,
+ hinfo->bhash_size)];
+ spin_lock_bh(&head->lock);
- /* By starting with offset being an even number,
- * we tend to leave about 50% of ports for other uses,
- * like bind(0).
+ /* Does not bother with rcv_saddr checks, because
+ * the established check is already unique enough.
*/
- offset &= ~1;
-
- local_bh_disable();
- for (i = 0; i < remaining; i++) {
- port = low + (i + offset) % remaining;
- if (inet_is_local_reserved_port(net, port))
- continue;
- head = &hinfo->bhash[inet_bhashfn(net, port,
- hinfo->bhash_size)];
- spin_lock(&head->lock);
-
- /* Does not bother with rcv_saddr checks,
- * because the established check is already
- * unique enough.
- */
- inet_bind_bucket_for_each(tb, &head->chain) {
- if (net_eq(ib_net(tb), net) &&
- tb->port == port) {
- if (tb->fastreuse >= 0 ||
- tb->fastreuseport >= 0)
- goto next_port;
- WARN_ON(hlist_empty(&tb->owners));
- if (!check_established(death_row, sk,
- port, &tw))
- goto ok;
+ inet_bind_bucket_for_each(tb, &head->chain) {
+ if (net_eq(ib_net(tb), net) && tb->port == port) {
+ if (tb->fastreuse >= 0 ||
+ tb->fastreuseport >= 0)
goto next_port;
- }
+ WARN_ON(hlist_empty(&tb->owners));
+ if (!check_established(death_row, sk,
+ port, &tw))
+ goto ok;
+ goto next_port;
}
-
- tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
- net, head, port);
- if (!tb) {
- spin_unlock(&head->lock);
- break;
- }
- tb->fastreuse = -1;
- tb->fastreuseport = -1;
- goto ok;
-
- next_port:
- spin_unlock(&head->lock);
}
- local_bh_enable();
-
- return -EADDRNOTAVAIL;
-ok:
- hint += (i + 2) & ~1;
-
- /* Head lock still held and bh's disabled */
- inet_bind_hash(sk, tb, port);
- if (sk_unhashed(sk)) {
- inet_sk(sk)->inet_sport = htons(port);
- inet_ehash_nolisten(sk, (struct sock *)tw);
+ tb = inet_bind_bucket_create(hinfo->bind_bucket_cachep,
+ net, head, port);
+ if (!tb) {
+ spin_unlock_bh(&head->lock);
+ return -ENOMEM;
}
- if (tw)
- inet_twsk_bind_unhash(tw, hinfo);
- spin_unlock(&head->lock);
+ tb->fastreuse = -1;
+ tb->fastreuseport = -1;
+ goto ok;
+next_port:
+ spin_unlock_bh(&head->lock);
+ cond_resched();
+ }
- if (tw)
- inet_twsk_deschedule_put(tw);
+ offset++;
+ if ((offset & 1) && remaining > 1)
+ goto other_parity_scan;
- ret = 0;
- goto out;
- }
+ return -EADDRNOTAVAIL;
- head = &hinfo->bhash[inet_bhashfn(net, snum, hinfo->bhash_size)];
- tb = inet_csk(sk)->icsk_bind_hash;
- spin_lock_bh(&head->lock);
- if (sk_head(&tb->owners) == sk && !sk->sk_bind_node.next) {
- inet_ehash_nolisten(sk, NULL);
- spin_unlock_bh(&head->lock);
- return 0;
- } else {
- spin_unlock(&head->lock);
- /* No definite answer... Walk to established hash table */
- ret = check_established(death_row, sk, snum, NULL);
-out:
- local_bh_enable();
- return ret;
+ok:
+ hint += i + 2;
+
+ /* Head lock still held and bh's disabled */
+ inet_bind_hash(sk, tb, port);
+ if (sk_unhashed(sk)) {
+ inet_sk(sk)->inet_sport = htons(port);
+ inet_ehash_nolisten(sk, (struct sock *)tw);
}
+ if (tw)
+ inet_twsk_bind_unhash(tw, hinfo);
+ spin_unlock(&head->lock);
+ if (tw)
+ inet_twsk_deschedule_put(tw);
+ local_bh_enable();
+ return 0;
}
/*
diff --git a/net/ipv4/inet_lro.c b/net/ipv4/inet_lro.c
deleted file mode 100644
index f17ea49b28fb..000000000000
--- a/net/ipv4/inet_lro.c
+++ /dev/null
@@ -1,374 +0,0 @@
-/*
- * linux/net/ipv4/inet_lro.c
- *
- * Large Receive Offload (ipv4 / tcp)
- *
- * (C) Copyright IBM Corp. 2007
- *
- * Authors:
- * Jan-Bernd Themann <themann@de.ibm.com>
- * Christoph Raisch <raisch@de.ibm.com>
- *
- *
- * This program is free software; you can redistribute it and/or modify
- * it under the terms of the GNU General Public License as published by
- * the Free Software Foundation; either version 2, or (at your option)
- * any later version.
- *
- * This program is distributed in the hope that it will be useful,
- * but WITHOUT ANY WARRANTY; without even the implied warranty of
- * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
- * GNU General Public License for more details.
- *
- * You should have received a copy of the GNU General Public License
- * along with this program; if not, write to the Free Software
- * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA.
- */
-
-
-#include <linux/module.h>
-#include <linux/if_vlan.h>
-#include <linux/inet_lro.h>
-#include <net/checksum.h>
-
-MODULE_LICENSE("GPL");
-MODULE_AUTHOR("Jan-Bernd Themann <themann@de.ibm.com>");
-MODULE_DESCRIPTION("Large Receive Offload (ipv4 / tcp)");
-
-#define TCP_HDR_LEN(tcph) (tcph->doff << 2)
-#define IP_HDR_LEN(iph) (iph->ihl << 2)
-#define TCP_PAYLOAD_LENGTH(iph, tcph) \
- (ntohs(iph->tot_len) - IP_HDR_LEN(iph) - TCP_HDR_LEN(tcph))
-
-#define IPH_LEN_WO_OPTIONS 5
-#define TCPH_LEN_WO_OPTIONS 5
-#define TCPH_LEN_W_TIMESTAMP 8
-
-#define LRO_MAX_PG_HLEN 64
-
-#define LRO_INC_STATS(lro_mgr, attr) { lro_mgr->stats.attr++; }
-
-/*
- * Basic tcp checks whether packet is suitable for LRO
- */
-
-static int lro_tcp_ip_check(const struct iphdr *iph, const struct tcphdr *tcph,
- int len, const struct net_lro_desc *lro_desc)
-{
- /* check ip header: don't aggregate padded frames */
- if (ntohs(iph->tot_len) != len)
- return -1;
-
- if (TCP_PAYLOAD_LENGTH(iph, tcph) == 0)
- return -1;
-
- if (iph->ihl != IPH_LEN_WO_OPTIONS)
- return -1;
-
- if (tcph->cwr || tcph->ece || tcph->urg || !tcph->ack ||
- tcph->rst || tcph->syn || tcph->fin)
- return -1;
-
- if (INET_ECN_is_ce(ipv4_get_dsfield(iph)))
- return -1;
-
- if (tcph->doff != TCPH_LEN_WO_OPTIONS &&
- tcph->doff != TCPH_LEN_W_TIMESTAMP)
- return -1;
-
- /* check tcp options (only timestamp allowed) */
- if (tcph->doff == TCPH_LEN_W_TIMESTAMP) {
- __be32 *topt = (__be32 *)(tcph + 1);
-
- if (*topt != htonl((TCPOPT_NOP << 24) | (TCPOPT_NOP << 16)
- | (TCPOPT_TIMESTAMP << 8)
- | TCPOLEN_TIMESTAMP))
- return -1;
-
- /* timestamp should be in right order */
- topt++;
- if (lro_desc && after(ntohl(lro_desc->tcp_rcv_tsval),
- ntohl(*topt)))
- return -1;
-
- /* timestamp reply should not be zero */
- topt++;
- if (*topt == 0)
- return -1;
- }
-
- return 0;
-}
-
-static void lro_update_tcp_ip_header(struct net_lro_desc *lro_desc)
-{
- struct iphdr *iph = lro_desc->iph;
- struct tcphdr *tcph = lro_desc->tcph;
- __be32 *p;
- __wsum tcp_hdr_csum;
-
- tcph->ack_seq = lro_desc->tcp_ack;
- tcph->window = lro_desc->tcp_window;
-
- if (lro_desc->tcp_saw_tstamp) {
- p = (__be32 *)(tcph + 1);
- *(p+2) = lro_desc->tcp_rcv_tsecr;
- }
-
- csum_replace2(&iph->check, iph->tot_len, htons(lro_desc->ip_tot_len));
- iph->tot_len = htons(lro_desc->ip_tot_len);
-
- tcph->check = 0;
- tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), 0);
- lro_desc->data_csum = csum_add(lro_desc->data_csum, tcp_hdr_csum);
- tcph->check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- lro_desc->ip_tot_len -
- IP_HDR_LEN(iph), IPPROTO_TCP,
- lro_desc->data_csum);
-}
-
-static __wsum lro_tcp_data_csum(struct iphdr *iph, struct tcphdr *tcph, int len)
-{
- __wsum tcp_csum;
- __wsum tcp_hdr_csum;
- __wsum tcp_ps_hdr_csum;
-
- tcp_csum = ~csum_unfold(tcph->check);
- tcp_hdr_csum = csum_partial(tcph, TCP_HDR_LEN(tcph), tcp_csum);
-
- tcp_ps_hdr_csum = csum_tcpudp_nofold(iph->saddr, iph->daddr,
- len + TCP_HDR_LEN(tcph),
- IPPROTO_TCP, 0);
-
- return csum_sub(csum_sub(tcp_csum, tcp_hdr_csum),
- tcp_ps_hdr_csum);
-}
-
-static void lro_init_desc(struct net_lro_desc *lro_desc, struct sk_buff *skb,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- int nr_frags;
- __be32 *ptr;
- u32 tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
- nr_frags = skb_shinfo(skb)->nr_frags;
- lro_desc->parent = skb;
- lro_desc->next_frag = &(skb_shinfo(skb)->frags[nr_frags]);
- lro_desc->iph = iph;
- lro_desc->tcph = tcph;
- lro_desc->tcp_next_seq = ntohl(tcph->seq) + tcp_data_len;
- lro_desc->tcp_ack = tcph->ack_seq;
- lro_desc->tcp_window = tcph->window;
-
- lro_desc->pkt_aggr_cnt = 1;
- lro_desc->ip_tot_len = ntohs(iph->tot_len);
-
- if (tcph->doff == 8) {
- ptr = (__be32 *)(tcph+1);
- lro_desc->tcp_saw_tstamp = 1;
- lro_desc->tcp_rcv_tsval = *(ptr+1);
- lro_desc->tcp_rcv_tsecr = *(ptr+2);
- }
-
- lro_desc->mss = tcp_data_len;
- lro_desc->active = 1;
-
- lro_desc->data_csum = lro_tcp_data_csum(iph, tcph,
- tcp_data_len);
-}
-
-static inline void lro_clear_desc(struct net_lro_desc *lro_desc)
-{
- memset(lro_desc, 0, sizeof(struct net_lro_desc));
-}
-
-static void lro_add_common(struct net_lro_desc *lro_desc, struct iphdr *iph,
- struct tcphdr *tcph, int tcp_data_len)
-{
- struct sk_buff *parent = lro_desc->parent;
- __be32 *topt;
-
- lro_desc->pkt_aggr_cnt++;
- lro_desc->ip_tot_len += tcp_data_len;
- lro_desc->tcp_next_seq += tcp_data_len;
- lro_desc->tcp_window = tcph->window;
- lro_desc->tcp_ack = tcph->ack_seq;
-
- /* don't update tcp_rcv_tsval, would not work with PAWS */
- if (lro_desc->tcp_saw_tstamp) {
- topt = (__be32 *) (tcph + 1);
- lro_desc->tcp_rcv_tsecr = *(topt + 2);
- }
-
- lro_desc->data_csum = csum_block_add(lro_desc->data_csum,
- lro_tcp_data_csum(iph, tcph,
- tcp_data_len),
- parent->len);
-
- parent->len += tcp_data_len;
- parent->data_len += tcp_data_len;
- if (tcp_data_len > lro_desc->mss)
- lro_desc->mss = tcp_data_len;
-}
-
-static void lro_add_packet(struct net_lro_desc *lro_desc, struct sk_buff *skb,
- struct iphdr *iph, struct tcphdr *tcph)
-{
- struct sk_buff *parent = lro_desc->parent;
- int tcp_data_len = TCP_PAYLOAD_LENGTH(iph, tcph);
-
- lro_add_common(lro_desc, iph, tcph, tcp_data_len);
-
- skb_pull(skb, (skb->len - tcp_data_len));
- parent->truesize += skb->truesize;
-
- if (lro_desc->last_skb)
- lro_desc->last_skb->next = skb;
- else
- skb_shinfo(parent)->frag_list = skb;
-
- lro_desc->last_skb = skb;
-}
-
-
-static int lro_check_tcp_conn(struct net_lro_desc *lro_desc,
- struct iphdr *iph,
- struct tcphdr *tcph)
-{
- if ((lro_desc->iph->saddr != iph->saddr) ||
- (lro_desc->iph->daddr != iph->daddr) ||
- (lro_desc->tcph->source != tcph->source) ||
- (lro_desc->tcph->dest != tcph->dest))
- return -1;
- return 0;
-}
-
-static struct net_lro_desc *lro_get_desc(struct net_lro_mgr *lro_mgr,
- struct net_lro_desc *lro_arr,
- struct iphdr *iph,
- struct tcphdr *tcph)
-{
- struct net_lro_desc *lro_desc = NULL;
- struct net_lro_desc *tmp;
- int max_desc = lro_mgr->max_desc;
- int i;
-
- for (i = 0; i < max_desc; i++) {
- tmp = &lro_arr[i];
- if (tmp->active)
- if (!lro_check_tcp_conn(tmp, iph, tcph)) {
- lro_desc = tmp;
- goto out;
- }
- }
-
- for (i = 0; i < max_desc; i++) {
- if (!lro_arr[i].active) {
- lro_desc = &lro_arr[i];
- goto out;
- }
- }
-
- LRO_INC_STATS(lro_mgr, no_desc);
-out:
- return lro_desc;
-}
-
-static void lro_flush(struct net_lro_mgr *lro_mgr,
- struct net_lro_desc *lro_desc)
-{
- if (lro_desc->pkt_aggr_cnt > 1)
- lro_update_tcp_ip_header(lro_desc);
-
- skb_shinfo(lro_desc->parent)->gso_size = lro_desc->mss;
-
- if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(lro_desc->parent);
- else
- netif_rx(lro_desc->parent);
-
- LRO_INC_STATS(lro_mgr, flushed);
- lro_clear_desc(lro_desc);
-}
-
-static int __lro_proc_skb(struct net_lro_mgr *lro_mgr, struct sk_buff *skb,
- void *priv)
-{
- struct net_lro_desc *lro_desc;
- struct iphdr *iph;
- struct tcphdr *tcph;
- u64 flags;
- int vlan_hdr_len = 0;
-
- if (!lro_mgr->get_skb_header ||
- lro_mgr->get_skb_header(skb, (void *)&iph, (void *)&tcph,
- &flags, priv))
- goto out;
-
- if (!(flags & LRO_IPV4) || !(flags & LRO_TCP))
- goto out;
-
- lro_desc = lro_get_desc(lro_mgr, lro_mgr->lro_arr, iph, tcph);
- if (!lro_desc)
- goto out;
-
- if ((skb->protocol == htons(ETH_P_8021Q)) &&
- !(lro_mgr->features & LRO_F_EXTRACT_VLAN_ID))
- vlan_hdr_len = VLAN_HLEN;
-
- if (!lro_desc->active) { /* start new lro session */
- if (lro_tcp_ip_check(iph, tcph, skb->len - vlan_hdr_len, NULL))
- goto out;
-
- skb->ip_summed = lro_mgr->ip_summed_aggr;
- lro_init_desc(lro_desc, skb, iph, tcph);
- LRO_INC_STATS(lro_mgr, aggregated);
- return 0;
- }
-
- if (lro_desc->tcp_next_seq != ntohl(tcph->seq))
- goto out2;
-
- if (lro_tcp_ip_check(iph, tcph, skb->len, lro_desc))
- goto out2;
-
- lro_add_packet(lro_desc, skb, iph, tcph);
- LRO_INC_STATS(lro_mgr, aggregated);
-
- if ((lro_desc->pkt_aggr_cnt >= lro_mgr->max_aggr) ||
- lro_desc->parent->len > (0xFFFF - lro_mgr->dev->mtu))
- lro_flush(lro_mgr, lro_desc);
-
- return 0;
-
-out2: /* send aggregated SKBs to stack */
- lro_flush(lro_mgr, lro_desc);
-
-out:
- return 1;
-}
-
-void lro_receive_skb(struct net_lro_mgr *lro_mgr,
- struct sk_buff *skb,
- void *priv)
-{
- if (__lro_proc_skb(lro_mgr, skb, priv)) {
- if (lro_mgr->features & LRO_F_NAPI)
- netif_receive_skb(skb);
- else
- netif_rx(skb);
- }
-}
-EXPORT_SYMBOL(lro_receive_skb);
-
-void lro_flush_all(struct net_lro_mgr *lro_mgr)
-{
- int i;
- struct net_lro_desc *lro_desc = lro_mgr->lro_arr;
-
- for (i = 0; i < lro_mgr->max_desc; i++) {
- if (lro_desc[i].active)
- lro_flush(lro_mgr, &lro_desc[i]);
- }
-}
-EXPORT_SYMBOL(lro_flush_all);
diff --git a/net/ipv4/ip_forward.c b/net/ipv4/ip_forward.c
index da0d7ce85844..af18f1e4889e 100644
--- a/net/ipv4/ip_forward.c
+++ b/net/ipv4/ip_forward.c
@@ -71,7 +71,6 @@ static int ip_forward_finish(struct net *net, struct sock *sk, struct sk_buff *s
if (unlikely(opt->optlen))
ip_forward_options(skb);
- skb_sender_cpu_clear(skb);
return dst_output(net, sk, skb);
}
diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c
index 1fe55ae81781..efbd47d1a531 100644
--- a/net/ipv4/ip_fragment.c
+++ b/net/ipv4/ip_fragment.c
@@ -54,8 +54,6 @@
* code now. If you change something here, _PLEASE_ update ipv6/reassembly.c
* as well. Or notify me, at least. --ANK
*/
-
-static int sysctl_ipfrag_max_dist __read_mostly = 64;
static const char ip_frag_cache_name[] = "ip4-frags";
struct ipfrag_skb_cb
@@ -150,7 +148,7 @@ static void ip4_frag_init(struct inet_frag_queue *q, const void *a)
qp->daddr = arg->iph->daddr;
qp->vif = arg->vif;
qp->user = arg->user;
- qp->peer = sysctl_ipfrag_max_dist ?
+ qp->peer = q->net->max_dist ?
inet_getpeer_v4(net->ipv4.peers, arg->iph->saddr, arg->vif, 1) :
NULL;
}
@@ -275,7 +273,7 @@ static struct ipq *ip_find(struct net *net, struct iphdr *iph,
static int ip_frag_too_far(struct ipq *qp)
{
struct inet_peer *peer = qp->peer;
- unsigned int max = sysctl_ipfrag_max_dist;
+ unsigned int max = qp->q.net->max_dist;
unsigned int start, end;
int rc;
@@ -661,6 +659,7 @@ int ip_defrag(struct net *net, struct sk_buff *skb, u32 user)
struct ipq *qp;
IP_INC_STATS_BH(net, IPSTATS_MIB_REASMREQDS);
+ skb_orphan(skb);
/* Lookup (or create) queue header */
qp = ip_find(net, ip_hdr(skb), user, vif);
@@ -748,6 +747,14 @@ static struct ctl_table ip4_frags_ns_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
+ {
+ .procname = "ipfrag_max_dist",
+ .data = &init_net.ipv4.frags.max_dist,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero
+ },
{ }
};
@@ -761,14 +768,6 @@ static struct ctl_table ip4_frags_ctl_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
},
- {
- .procname = "ipfrag_max_dist",
- .data = &sysctl_ipfrag_max_dist,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &zero
- },
{ }
};
@@ -789,10 +788,7 @@ static int __net_init ip4_frags_ns_ctl_register(struct net *net)
table[1].data = &net->ipv4.frags.low_thresh;
table[1].extra2 = &net->ipv4.frags.high_thresh;
table[2].data = &net->ipv4.frags.timeout;
-
- /* Don't export sysctls to unprivileged users */
- if (net->user_ns != &init_user_ns)
- table[0].procname = NULL;
+ table[3].data = &net->ipv4.frags.max_dist;
}
hdr = register_net_sysctl(net, "net/ipv4", table);
@@ -864,6 +860,8 @@ static int __net_init ipv4_frags_init_net(struct net *net)
*/
net->ipv4.frags.timeout = IP_FRAG_TIME;
+ net->ipv4.frags.max_dist = 64;
+
res = inet_frags_init_net(&net->ipv4.frags);
if (res)
return res;
@@ -891,7 +889,6 @@ void __init ipfrag_init(void)
ip4_frags.hashfn = ip4_hashfn;
ip4_frags.constructor = ip4_frag_init;
ip4_frags.destructor = ip4_frag_free;
- ip4_frags.skb_free = NULL;
ip4_frags.qsize = sizeof(struct ipq);
ip4_frags.match = ip4_frag_match;
ip4_frags.frag_expire = ip_expire;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 614521437e30..af5d1f38217f 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -24,7 +24,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/if_vlan.h>
#include <linux/init.h>
#include <linux/in6.h>
@@ -239,7 +238,7 @@ static int parse_gre_header(struct sk_buff *skb, struct tnl_ptk_info *tpi,
return -EINVAL;
}
}
- return iptunnel_pull_header(skb, hdr_len, tpi->proto);
+ return iptunnel_pull_header(skb, hdr_len, tpi->proto, false);
}
static void ipgre_err(struct sk_buff *skb, u32 info,
@@ -441,6 +440,17 @@ drop:
return 0;
}
+static __sum16 gre_checksum(struct sk_buff *skb)
+{
+ __wsum csum;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ csum = lco_csum(skb);
+ else
+ csum = skb_checksum(skb, 0, skb->len, 0);
+ return csum_fold(csum);
+}
+
static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
__be16 proto, __be32 key, __be32 seq)
{
@@ -468,8 +478,7 @@ static void build_header(struct sk_buff *skb, int hdr_len, __be16 flags,
!(skb_shinfo(skb)->gso_type &
(SKB_GSO_GRE | SKB_GSO_GRE_CSUM))) {
*ptr = 0;
- *(__sum16 *)ptr = csum_fold(skb_checksum(skb, 0,
- skb->len, 0));
+ *(__sum16 *)ptr = gre_checksum(skb);
}
}
}
@@ -494,8 +503,7 @@ static void __gre_xmit(struct sk_buff *skb, struct net_device *dev,
static struct sk_buff *gre_handle_offloads(struct sk_buff *skb,
bool csum)
{
- return iptunnel_handle_offloads(skb, csum,
- csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
+ return iptunnel_handle_offloads(skb, csum ? SKB_GSO_GRE_CSUM : SKB_GSO_GRE);
}
static struct rtable *gre_get_rt(struct sk_buff *skb,
@@ -519,11 +527,12 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct ip_tunnel_info *tun_info;
const struct ip_tunnel_key *key;
+ struct rtable *rt = NULL;
struct flowi4 fl;
- struct rtable *rt;
int min_headroom;
int tunnel_hlen;
__be16 df, flags;
+ bool use_cache;
int err;
tun_info = skb_tunnel_info(skb);
@@ -532,9 +541,17 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
goto err_free_skb;
key = &tun_info->key;
- rt = gre_get_rt(skb, dev, &fl, key);
- if (IS_ERR(rt))
- goto err_free_skb;
+ use_cache = ip_tunnel_dst_cache_usable(skb, tun_info);
+ if (use_cache)
+ rt = dst_cache_get_ip4(&tun_info->dst_cache, &fl.saddr);
+ if (!rt) {
+ rt = gre_get_rt(skb, dev, &fl, key);
+ if (IS_ERR(rt))
+ goto err_free_skb;
+ if (use_cache)
+ dst_cache_set_ip4(&tun_info->dst_cache, &rt->dst,
+ fl.saddr);
+ }
tunnel_hlen = ip_gre_calc_hlen(key->tun_flags);
@@ -562,10 +579,9 @@ static void gre_fb_xmit(struct sk_buff *skb, struct net_device *dev)
tunnel_id_to_key(tun_info->key.tun_id), 0);
df = key->tun_flags & TUNNEL_DONT_FRAGMENT ? htons(IP_DF) : 0;
- err = iptunnel_xmit(skb->sk, rt, skb, fl.saddr,
- key->u.ipv4.dst, IPPROTO_GRE,
- key->tos, key->ttl, df, false);
- iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+
+ iptunnel_xmit(skb->sk, rt, skb, fl.saddr, key->u.ipv4.dst, IPPROTO_GRE,
+ key->tos, key->ttl, df, false);
return;
err_free_rt:
@@ -846,9 +862,16 @@ static void __gre_tunnel_init(struct net_device *dev)
dev->hw_features |= GRE_FEATURES;
if (!(tunnel->parms.o_flags & TUNNEL_SEQ)) {
- /* TCP offload with GRE SEQ is not supported. */
- dev->features |= NETIF_F_GSO_SOFTWARE;
- dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ /* TCP offload with GRE SEQ is not supported, nor
+ * can we support 2 levels of outer headers requiring
+ * an update.
+ */
+ if (!(tunnel->parms.o_flags & TUNNEL_CSUM) ||
+ (tunnel->encap.type == TUNNEL_ENCAP_NONE)) {
+ dev->features |= NETIF_F_GSO_SOFTWARE;
+ dev->hw_features |= NETIF_F_GSO_SOFTWARE;
+ }
+
/* Can use a lockless transmit, unless we generate
* output sequences
*/
@@ -1056,8 +1079,9 @@ static const struct net_device_ops gre_tap_netdev_ops = {
static void ipgre_tap_setup(struct net_device *dev)
{
ether_setup(dev);
- dev->netdev_ops = &gre_tap_netdev_ops;
- dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
+ dev->netdev_ops = &gre_tap_netdev_ops;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
+ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
ip_tunnel_setup(dev, gre_tap_net_id);
}
@@ -1242,6 +1266,14 @@ struct net_device *gretap_fb_dev_create(struct net *net, const char *name,
err = ipgre_newlink(net, dev, tb, NULL);
if (err < 0)
goto out;
+
+ /* openvswitch users expect packet sizes to be unrestricted,
+ * so set the largest MTU we can.
+ */
+ err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
+ if (err)
+ goto out;
+
return dev;
out:
free_netdev(dev);
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index b1209b63381f..e3d782746d9d 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -308,15 +308,15 @@ drop:
return true;
}
-int sysctl_ip_early_demux __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_ip_early_demux);
-
static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
const struct iphdr *iph = ip_hdr(skb);
struct rtable *rt;
- if (sysctl_ip_early_demux && !skb_dst(skb) && !skb->sk) {
+ if (net->ipv4.sysctl_ip_early_demux &&
+ !skb_dst(skb) &&
+ !skb->sk &&
+ !ip_is_fragment(iph)) {
const struct net_protocol *ipprot;
int protocol = iph->protocol;
@@ -359,8 +359,31 @@ static int ip_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
rt = skb_rtable(skb);
if (rt->rt_type == RTN_MULTICAST) {
IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INMCAST, skb->len);
- } else if (rt->rt_type == RTN_BROADCAST)
+ } else if (rt->rt_type == RTN_BROADCAST) {
IP_UPD_PO_STATS_BH(net, IPSTATS_MIB_INBCAST, skb->len);
+ } else if (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST) {
+ struct in_device *in_dev = __in_dev_get_rcu(skb->dev);
+
+ /* RFC 1122 3.3.6:
+ *
+ * When a host sends a datagram to a link-layer broadcast
+ * address, the IP destination address MUST be a legal IP
+ * broadcast or IP multicast address.
+ *
+ * A host SHOULD silently discard a datagram that is received
+ * via a link-layer broadcast (see Section 2.4) but does not
+ * specify an IP multicast or broadcast destination address.
+ *
+ * This doesn't explicitly say L2 *broadcast*, but broadcast is
+ * in a way a form of multicast and the most common use case for
+ * this is 802.11 protecting against cross-station spoofing (the
+ * so-called "hole-196" attack) so do it for both.
+ */
+ if (in_dev &&
+ IN_DEV_ORCONF(in_dev, DROP_UNICAST_IN_L2_MULTICAST))
+ goto drop;
+ }
return dst_input(skb);
diff --git a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c
index bd246792360b..4d158ff1def1 100644
--- a/net/ipv4/ip_options.c
+++ b/net/ipv4/ip_options.c
@@ -58,10 +58,9 @@ void ip_options_build(struct sk_buff *skb, struct ip_options *opt,
if (opt->ts_needaddr)
ip_rt_get_source(iph+opt->ts+iph[opt->ts+2]-9, skb, rt);
if (opt->ts_needtime) {
- struct timespec tv;
__be32 midtime;
- getnstimeofday(&tv);
- midtime = htonl((tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC);
+
+ midtime = inet_current_timestamp();
memcpy(iph+opt->ts+iph[opt->ts+2]-5, &midtime, 4);
}
return;
@@ -415,11 +414,10 @@ int ip_options_compile(struct net *net,
break;
}
if (timeptr) {
- struct timespec tv;
- u32 midtime;
- getnstimeofday(&tv);
- midtime = (tv.tv_sec % 86400) * MSEC_PER_SEC + tv.tv_nsec / NSEC_PER_MSEC;
- put_unaligned_be32(midtime, timeptr);
+ __be32 midtime;
+
+ midtime = inet_current_timestamp();
+ memcpy(timeptr, &midtime, 4);
opt->is_changed = 1;
}
} else if ((optptr[3]&0xF) != IPOPT_TS_PRESPEC) {
diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c
index 4233cbe47052..124bf0a66328 100644
--- a/net/ipv4/ip_output.c
+++ b/net/ipv4/ip_output.c
@@ -76,13 +76,9 @@
#include <linux/igmp.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
-#include <linux/mroute.h>
#include <linux/netlink.h>
#include <linux/tcp.h>
-int sysctl_ip_default_ttl __read_mostly = IPDEFTTL;
-EXPORT_SYMBOL(sysctl_ip_default_ttl);
-
static int
ip_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
unsigned int mtu,
@@ -240,6 +236,7 @@ static int ip_finish_output_gso(struct net *net, struct sock *sk,
* from host network stack.
*/
features = netif_skb_features(skb);
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
if (IS_ERR_OR_NULL(segs)) {
kfree_skb(skb);
@@ -912,7 +909,7 @@ static int __ip_append_data(struct sock *sk,
*/
if (transhdrlen &&
length + fragheaderlen <= mtu &&
- rt->dst.dev->features & NETIF_F_V4_CSUM &&
+ rt->dst.dev->features & (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM) &&
!(flags & MSG_MORE) &&
!exthdrlen)
csummode = CHECKSUM_PARTIAL;
@@ -921,7 +918,7 @@ static int __ip_append_data(struct sock *sk,
if (((length > mtu) || (skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) && !rt->dst.header_len &&
- (sk->sk_type == SOCK_DGRAM)) {
+ (sk->sk_type == SOCK_DGRAM) && !sk->sk_no_check_tx) {
err = ip_ufo_append_data(sk, queue, getfrag, from, length,
hh_len, fragheaderlen, transhdrlen,
maxfraglen, flags);
@@ -1236,13 +1233,16 @@ ssize_t ip_append_page(struct sock *sk, struct flowi4 *fl4, struct page *page,
if (!skb)
return -EINVAL;
- cork->length += size;
if ((size + skb->len > mtu) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO)) {
+ if (skb->ip_summed != CHECKSUM_PARTIAL)
+ return -EOPNOTSUPP;
+
skb_shinfo(skb)->gso_size = mtu - fragheaderlen;
skb_shinfo(skb)->gso_type = SKB_GSO_UDP;
}
+ cork->length += size;
while (size > 0) {
if (skb_is_gso(skb)) {
diff --git a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c
index c3c359ad66e3..035ad645a8d9 100644
--- a/net/ipv4/ip_sockglue.c
+++ b/net/ipv4/ip_sockglue.c
@@ -249,6 +249,8 @@ int ip_cmsg_send(struct net *net, struct msghdr *msg, struct ipcm_cookie *ipc,
switch (cmsg->cmsg_type) {
case IP_RETOPTS:
err = cmsg->cmsg_len - CMSG_ALIGN(sizeof(struct cmsghdr));
+
+ /* Our caller is responsible for freeing ipc->opt */
err = ip_options_get(net, &ipc->opt, CMSG_DATA(cmsg),
err < 40 ? err : 40);
if (err)
@@ -571,6 +573,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
int optname, char __user *optval, unsigned int optlen)
{
struct inet_sock *inet = inet_sk(sk);
+ struct net *net = sock_net(sk);
int val = 0, err;
bool needs_rtnl = setsockopt_needs_rtnl(optname);
@@ -910,7 +913,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
}
/* numsrc >= (1G-4) overflow in 32 bits */
if (msf->imsf_numsrc >= 0x3ffffffcU ||
- msf->imsf_numsrc > sysctl_igmp_max_msf) {
+ msf->imsf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
kfree(msf);
err = -ENOBUFS;
break;
@@ -1065,7 +1068,7 @@ static int do_ip_setsockopt(struct sock *sk, int level,
/* numsrc >= (4G-140)/128 overflow in 32 bits */
if (gsf->gf_numsrc >= 0x1ffffff ||
- gsf->gf_numsrc > sysctl_igmp_max_msf) {
+ gsf->gf_numsrc > net->ipv4.sysctl_igmp_max_msf) {
err = -ENOBUFS;
goto mc_msf_out;
}
@@ -1251,11 +1254,22 @@ EXPORT_SYMBOL(compat_ip_setsockopt);
* the _received_ ones. The set sets the _sent_ ones.
*/
+static bool getsockopt_needs_rtnl(int optname)
+{
+ switch (optname) {
+ case IP_MSFILTER:
+ case MCAST_MSFILTER:
+ return true;
+ }
+ return false;
+}
+
static int do_ip_getsockopt(struct sock *sk, int level, int optname,
char __user *optval, int __user *optlen, unsigned int flags)
{
struct inet_sock *inet = inet_sk(sk);
- int val;
+ bool needs_rtnl = getsockopt_needs_rtnl(optname);
+ int val, err = 0;
int len;
if (level != SOL_IP)
@@ -1269,6 +1283,8 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
if (len < 0)
return -EINVAL;
+ if (needs_rtnl)
+ rtnl_lock();
lock_sock(sk);
switch (optname) {
@@ -1327,10 +1343,13 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
val = inet->tos;
break;
case IP_TTL:
+ {
+ struct net *net = sock_net(sk);
val = (inet->uc_ttl == -1 ?
- sysctl_ip_default_ttl :
+ net->ipv4.sysctl_ip_default_ttl :
inet->uc_ttl);
break;
+ }
case IP_HDRINCL:
val = inet->hdrincl;
break;
@@ -1386,39 +1405,35 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
case IP_MSFILTER:
{
struct ip_msfilter msf;
- int err;
if (len < IP_MSFILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
if (copy_from_user(&msf, optval, IP_MSFILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
err = ip_mc_msfget(sk, &msf,
(struct ip_msfilter __user *)optval, optlen);
- release_sock(sk);
- return err;
+ goto out;
}
case MCAST_MSFILTER:
{
struct group_filter gsf;
- int err;
if (len < GROUP_FILTER_SIZE(0)) {
- release_sock(sk);
- return -EINVAL;
+ err = -EINVAL;
+ goto out;
}
if (copy_from_user(&gsf, optval, GROUP_FILTER_SIZE(0))) {
- release_sock(sk);
- return -EFAULT;
+ err = -EFAULT;
+ goto out;
}
err = ip_mc_gsfget(sk, &gsf,
(struct group_filter __user *)optval,
optlen);
- release_sock(sk);
- return err;
+ goto out;
}
case IP_MULTICAST_ALL:
val = inet->mc_all;
@@ -1485,6 +1500,12 @@ static int do_ip_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
}
return 0;
+
+out:
+ release_sock(sk);
+ if (needs_rtnl)
+ rtnl_unlock();
+ return err;
}
int ip_getsockopt(struct sock *sk, int level,
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index cbb51f3fac06..6aad0192443d 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -30,7 +30,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
@@ -69,61 +68,6 @@ static unsigned int ip_tunnel_hash(__be32 key, __be32 remote)
IP_TNL_HASH_BITS);
}
-static void __tunnel_dst_set(struct ip_tunnel_dst *idst,
- struct dst_entry *dst, __be32 saddr)
-{
- struct dst_entry *old_dst;
-
- dst_clone(dst);
- old_dst = xchg((__force struct dst_entry **)&idst->dst, dst);
- dst_release(old_dst);
- idst->saddr = saddr;
-}
-
-static noinline void tunnel_dst_set(struct ip_tunnel *t,
- struct dst_entry *dst, __be32 saddr)
-{
- __tunnel_dst_set(raw_cpu_ptr(t->dst_cache), dst, saddr);
-}
-
-static void tunnel_dst_reset(struct ip_tunnel *t)
-{
- tunnel_dst_set(t, NULL, 0);
-}
-
-void ip_tunnel_dst_reset_all(struct ip_tunnel *t)
-{
- int i;
-
- for_each_possible_cpu(i)
- __tunnel_dst_set(per_cpu_ptr(t->dst_cache, i), NULL, 0);
-}
-EXPORT_SYMBOL(ip_tunnel_dst_reset_all);
-
-static struct rtable *tunnel_rtable_get(struct ip_tunnel *t,
- u32 cookie, __be32 *saddr)
-{
- struct ip_tunnel_dst *idst;
- struct dst_entry *dst;
-
- rcu_read_lock();
- idst = raw_cpu_ptr(t->dst_cache);
- dst = rcu_dereference(idst->dst);
- if (dst && !atomic_inc_not_zero(&dst->__refcnt))
- dst = NULL;
- if (dst) {
- if (!dst->obsolete || dst->ops->check(dst, cookie)) {
- *saddr = idst->saddr;
- } else {
- tunnel_dst_reset(t);
- dst_release(dst);
- dst = NULL;
- }
- }
- rcu_read_unlock();
- return (struct rtable *)dst;
-}
-
static bool ip_tunnel_key_match(const struct ip_tunnel_parm *p,
__be16 flags, __be32 key)
{
@@ -382,7 +326,8 @@ static int ip_tunnel_bind_dev(struct net_device *dev)
if (!IS_ERR(rt)) {
tdev = rt->dst.dev;
- tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+ dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+ fl4.saddr);
ip_rt_put(rt);
}
if (dev->type != ARPHRD_ETHER)
@@ -657,12 +602,13 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
struct rtable *rt; /* Route to the other host */
unsigned int max_headroom; /* The extra header space needed */
__be32 dst;
- int err;
bool connected;
inner_iph = (const struct iphdr *)skb_inner_network_header(skb);
connected = (tunnel->parms.iph.daddr != 0);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
dst = tnl_params->daddr;
if (dst == 0) {
/* NBMA tunnel */
@@ -731,7 +677,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
if (ip_tunnel_encap(skb, tunnel, &protocol, &fl4) < 0)
goto tx_error;
- rt = connected ? tunnel_rtable_get(tunnel, 0, &fl4.saddr) : NULL;
+ rt = connected ? dst_cache_get_ip4(&tunnel->dst_cache, &fl4.saddr) :
+ NULL;
if (!rt) {
rt = ip_route_output_key(tunnel->net, &fl4);
@@ -741,7 +688,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
goto tx_error;
}
if (connected)
- tunnel_dst_set(tunnel, &rt->dst, fl4.saddr);
+ dst_cache_set_ip4(&tunnel->dst_cache, &rt->dst,
+ fl4.saddr);
}
if (rt->dst.dev == dev) {
@@ -760,7 +708,6 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
tunnel->err_time + IPTUNNEL_ERR_TIMEO)) {
tunnel->err_count--;
- memset(IPCB(skb), 0, sizeof(*IPCB(skb)));
dst_link_failure(skb);
} else
tunnel->err_count = 0;
@@ -795,10 +742,8 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev,
return;
}
- err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol,
- tos, ttl, df, !net_eq(tunnel->net, dev_net(dev)));
- iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
-
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)));
return;
#if IS_ENABLED(CONFIG_IPV6)
@@ -840,7 +785,7 @@ static void ip_tunnel_update(struct ip_tunnel_net *itn,
if (set_mtu)
dev->mtu = mtu;
}
- ip_tunnel_dst_reset_all(t);
+ dst_cache_reset(&t->dst_cache);
netdev_state_change(dev);
}
@@ -947,17 +892,31 @@ done:
}
EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
-int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
+ int max_mtu = 0xFFF8 - dev->hard_header_len - t_hlen;
- if (new_mtu < 68 ||
- new_mtu > 0xFFF8 - dev->hard_header_len - t_hlen)
+ if (new_mtu < 68)
return -EINVAL;
+
+ if (new_mtu > max_mtu) {
+ if (strict)
+ return -EINVAL;
+
+ new_mtu = max_mtu;
+ }
+
dev->mtu = new_mtu;
return 0;
}
+EXPORT_SYMBOL_GPL(__ip_tunnel_change_mtu);
+
+int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+{
+ return __ip_tunnel_change_mtu(dev, new_mtu, true);
+}
EXPORT_SYMBOL_GPL(ip_tunnel_change_mtu);
static void ip_tunnel_dev_free(struct net_device *dev)
@@ -965,7 +924,7 @@ static void ip_tunnel_dev_free(struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
gro_cells_destroy(&tunnel->gro_cells);
- free_percpu(tunnel->dst_cache);
+ dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
free_netdev(dev);
}
@@ -1159,15 +1118,15 @@ int ip_tunnel_init(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
- tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
- if (!tunnel->dst_cache) {
+ err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+ if (err) {
free_percpu(dev->tstats);
- return -ENOMEM;
+ return err;
}
err = gro_cells_init(&tunnel->gro_cells, dev);
if (err) {
- free_percpu(tunnel->dst_cache);
+ dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
return err;
}
@@ -1197,7 +1156,7 @@ void ip_tunnel_uninit(struct net_device *dev)
if (itn->fb_tunnel_dev != dev)
ip_tunnel_del(itn, netdev_priv(dev));
- ip_tunnel_dst_reset_all(tunnel);
+ dst_cache_reset(&tunnel->dst_cache);
}
EXPORT_SYMBOL_GPL(ip_tunnel_uninit);
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 6cb9009c3d96..6165f30c4d72 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_core.c
@@ -24,7 +24,6 @@
#include <linux/netdevice.h>
#include <linux/in.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
@@ -48,12 +47,13 @@
#include <net/rtnetlink.h>
#include <net/dst_metadata.h>
-int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
- __be32 src, __be32 dst, __u8 proto,
- __u8 tos, __u8 ttl, __be16 df, bool xnet)
+void iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 proto,
+ __u8 tos, __u8 ttl, __be16 df, bool xnet)
{
int pkt_len = skb->len - skb_inner_network_offset(skb);
struct net *net = dev_net(rt->dst.dev);
+ struct net_device *dev = skb->dev;
struct iphdr *iph;
int err;
@@ -82,11 +82,12 @@ int iptunnel_xmit(struct sock *sk, struct rtable *rt, struct sk_buff *skb,
err = ip_local_out(net, sk, skb);
if (unlikely(net_xmit_eval(err)))
pkt_len = 0;
- return pkt_len;
+ iptunnel_xmit_stats(dev, pkt_len);
}
EXPORT_SYMBOL_GPL(iptunnel_xmit);
-int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
+int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto,
+ bool xnet)
{
if (unlikely(!pskb_may_pull(skb, hdr_len)))
return -ENOMEM;
@@ -109,14 +110,12 @@ int iptunnel_pull_header(struct sk_buff *skb, int hdr_len, __be16 inner_proto)
skb->protocol = inner_proto;
}
- nf_reset(skb);
- secpath_reset(skb);
skb_clear_hash_if_not_l4(skb);
- skb_dst_drop(skb);
skb->vlan_tci = 0;
skb_set_queue_mapping(skb, 0);
- skb->pkt_type = PACKET_HOST;
- return 0;
+ skb_scrub_packet(skb, xnet);
+
+ return iptunnel_pull_offloads(skb);
}
EXPORT_SYMBOL_GPL(iptunnel_pull_header);
@@ -148,7 +147,6 @@ struct metadata_dst *iptunnel_metadata_reply(struct metadata_dst *md,
EXPORT_SYMBOL_GPL(iptunnel_metadata_reply);
struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
- bool csum_help,
int gso_type_mask)
{
int err;
@@ -166,20 +164,15 @@ struct sk_buff *iptunnel_handle_offloads(struct sk_buff *skb,
return skb;
}
- /* If packet is not gso and we are resolving any partial checksum,
- * clear encapsulation flag. This allows setting CHECKSUM_PARTIAL
- * on the outer header without confusing devices that implement
- * NETIF_F_IP_CSUM with encapsulation.
- */
- if (csum_help)
- skb->encapsulation = 0;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL && csum_help) {
- err = skb_checksum_help(skb);
- if (unlikely(err))
- goto error;
- } else if (skb->ip_summed != CHECKSUM_PARTIAL)
+ if (skb->ip_summed != CHECKSUM_PARTIAL) {
skb->ip_summed = CHECKSUM_NONE;
+ /* We clear encapsulation here to prevent badly-written
+ * drivers potentially deciding to offload an inner checksum
+ * if we set CHECKSUM_PARTIAL on the outer header.
+ * This should go away when the drivers are all fixed.
+ */
+ skb->encapsulation = 0;
+ }
return skb;
error:
@@ -251,7 +244,7 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
if (tb[LWTUNNEL_IP_ID])
- tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP_ID]);
+ tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP_ID]);
if (tb[LWTUNNEL_IP_DST])
tun_info->key.u.ipv4.dst = nla_get_be32(tb[LWTUNNEL_IP_DST]);
@@ -266,7 +259,7 @@ static int ip_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP_TOS]);
if (tb[LWTUNNEL_IP_FLAGS])
- tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP_FLAGS]);
+ tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP_FLAGS]);
tun_info->mode = IP_TUNNEL_INFO_TX;
tun_info->options_len = 0;
@@ -281,12 +274,12 @@ static int ip_tun_fill_encap_info(struct sk_buff *skb,
{
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
- if (nla_put_u64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
+ if (nla_put_be64(skb, LWTUNNEL_IP_ID, tun_info->key.tun_id) ||
nla_put_be32(skb, LWTUNNEL_IP_DST, tun_info->key.u.ipv4.dst) ||
nla_put_be32(skb, LWTUNNEL_IP_SRC, tun_info->key.u.ipv4.src) ||
nla_put_u8(skb, LWTUNNEL_IP_TOS, tun_info->key.tos) ||
nla_put_u8(skb, LWTUNNEL_IP_TTL, tun_info->key.ttl) ||
- nla_put_u16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
+ nla_put_be16(skb, LWTUNNEL_IP_FLAGS, tun_info->key.tun_flags))
return -ENOMEM;
return 0;
@@ -346,7 +339,7 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info = lwt_tun_info(new_state);
if (tb[LWTUNNEL_IP6_ID])
- tun_info->key.tun_id = nla_get_u64(tb[LWTUNNEL_IP6_ID]);
+ tun_info->key.tun_id = nla_get_be64(tb[LWTUNNEL_IP6_ID]);
if (tb[LWTUNNEL_IP6_DST])
tun_info->key.u.ipv6.dst = nla_get_in6_addr(tb[LWTUNNEL_IP6_DST]);
@@ -361,7 +354,7 @@ static int ip6_tun_build_state(struct net_device *dev, struct nlattr *attr,
tun_info->key.tos = nla_get_u8(tb[LWTUNNEL_IP6_TC]);
if (tb[LWTUNNEL_IP6_FLAGS])
- tun_info->key.tun_flags = nla_get_u16(tb[LWTUNNEL_IP6_FLAGS]);
+ tun_info->key.tun_flags = nla_get_be16(tb[LWTUNNEL_IP6_FLAGS]);
tun_info->mode = IP_TUNNEL_INFO_TX | IP_TUNNEL_INFO_IPV6;
tun_info->options_len = 0;
@@ -376,12 +369,12 @@ static int ip6_tun_fill_encap_info(struct sk_buff *skb,
{
struct ip_tunnel_info *tun_info = lwt_tun_info(lwtstate);
- if (nla_put_u64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
+ if (nla_put_be64(skb, LWTUNNEL_IP6_ID, tun_info->key.tun_id) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_DST, &tun_info->key.u.ipv6.dst) ||
nla_put_in6_addr(skb, LWTUNNEL_IP6_SRC, &tun_info->key.u.ipv6.src) ||
- nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.tos) ||
- nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.ttl) ||
- nla_put_u16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
+ nla_put_u8(skb, LWTUNNEL_IP6_TC, tun_info->key.tos) ||
+ nla_put_u8(skb, LWTUNNEL_IP6_HOPLIMIT, tun_info->key.ttl) ||
+ nla_put_be16(skb, LWTUNNEL_IP6_FLAGS, tun_info->key.tun_flags))
return -ENOMEM;
return 0;
@@ -406,6 +399,12 @@ static const struct lwtunnel_encap_ops ip6_tun_lwt_ops = {
void __init ip_tunnel_core_init(void)
{
+ /* If you land here, make sure whether increasing ip_tunnel_info's
+ * options_len is a reasonable choice with its usage in front ends
+ * (f.e., it's part of flow keys, etc).
+ */
+ BUILD_BUG_ON(IP_TUNNEL_OPTS_MAX != 255);
+
lwtunnel_encap_add_ops(&ip_tun_lwt_ops, LWTUNNEL_ENCAP_IP);
lwtunnel_encap_add_ops(&ip6_tun_lwt_ops, LWTUNNEL_ENCAP_IP6);
}
diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 4d8f0b698777..5cf10b777b7e 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -30,7 +30,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
@@ -200,7 +199,7 @@ static netdev_tx_t vti_xmit(struct sk_buff *skb, struct net_device *dev,
err = dst_output(tunnel->net, skb->sk, skb);
if (net_xmit_eval(err) == 0)
err = skb->len;
- iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ iptunnel_xmit_stats(dev, err);
return NETDEV_TX_OK;
tx_error_icmp:
diff --git a/net/ipv4/ipconfig.c b/net/ipv4/ipconfig.c
index 0bc7412d9e14..2ed9dd2b5f2f 100644
--- a/net/ipv4/ipconfig.c
+++ b/net/ipv4/ipconfig.c
@@ -65,15 +65,6 @@
#include <net/checksum.h>
#include <asm/processor.h>
-/* Define this to allow debugging output */
-#undef IPCONFIG_DEBUG
-
-#ifdef IPCONFIG_DEBUG
-#define DBG(x) printk x
-#else
-#define DBG(x) do { } while(0)
-#endif
-
#if defined(CONFIG_IP_PNP_DHCP)
#define IPCONFIG_DHCP
#endif
@@ -152,7 +143,11 @@ static char dhcp_client_identifier[253] __initdata;
/* Persistent data: */
+#ifdef IPCONFIG_DYNAMIC
static int ic_proto_used; /* Protocol used, if any */
+#else
+#define ic_proto_used 0
+#endif
static __be32 ic_nameservers[CONF_NAMESERVERS_MAX]; /* DNS Server IP addresses */
static u8 ic_domain[64]; /* DNS (not NIS) domain name */
@@ -227,7 +222,7 @@ static int __init ic_open_devs(void)
if (dev->mtu >= 364)
able |= IC_BOOTP;
else
- pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small",
+ pr_warn("DHCP/BOOTP: Ignoring device %s, MTU %d too small\n",
dev->name, dev->mtu);
if (!(dev->flags & IFF_NOARP))
able |= IC_RARP;
@@ -254,8 +249,8 @@ static int __init ic_open_devs(void)
else
d->xid = 0;
ic_proto_have_if |= able;
- DBG(("IP-Config: %s UP (able=%d, xid=%08x)\n",
- dev->name, able, d->xid));
+ pr_debug("IP-Config: %s UP (able=%d, xid=%08x)\n",
+ dev->name, able, d->xid);
}
}
@@ -311,7 +306,7 @@ static void __init ic_close_devs(void)
next = d->next;
dev = d->dev;
if (dev != ic_dev && !netdev_uses_dsa(dev)) {
- DBG(("IP-Config: Downing %s\n", dev->name));
+ pr_debug("IP-Config: Downing %s\n", dev->name);
dev_change_flags(dev, d->flags);
}
kfree(d);
@@ -464,7 +459,8 @@ static int __init ic_defaults(void)
&ic_myaddr);
return -1;
}
- printk("IP-Config: Guessing netmask %pI4\n", &ic_netmask);
+ pr_notice("IP-Config: Guessing netmask %pI4\n",
+ &ic_netmask);
}
return 0;
@@ -675,9 +671,7 @@ ic_dhcp_init_options(u8 *options)
u8 *e = options;
int len;
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Sending message type %d\n", mt);
-#endif
+ pr_debug("DHCP: Sending message type %d\n", mt);
memcpy(e, ic_bootp_cookie, 4); /* RFC1048 Magic Cookie */
e += 4;
@@ -847,7 +841,8 @@ static void __init ic_bootp_send_if(struct ic_device *d, unsigned long jiffies_d
else if (dev->type == ARPHRD_FDDI)
b->htype = ARPHRD_ETHER;
else {
- printk("Unknown ARP type 0x%04x for device %s\n", dev->type, dev->name);
+ pr_warn("Unknown ARP type 0x%04x for device %s\n", dev->type,
+ dev->name);
b->htype = dev->type; /* can cause undefined behavior */
}
@@ -904,14 +899,12 @@ static void __init ic_do_bootp_ext(u8 *ext)
int i;
__be16 mtu;
-#ifdef IPCONFIG_DEBUG
u8 *c;
- printk("DHCP/BOOTP: Got extension %d:",*ext);
+ pr_debug("DHCP/BOOTP: Got extension %d:", *ext);
for (c=ext+2; c<ext+2+ext[1]; c++)
- printk(" %02x", *c);
- printk("\n");
-#endif
+ pr_debug(" %02x", *c);
+ pr_debug("\n");
switch (*ext++) {
case 1: /* Subnet mask */
@@ -1080,9 +1073,7 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
}
}
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Got message type %d\n", mt);
-#endif
+ pr_debug("DHCP: Got message type %d\n", mt);
switch (mt) {
case DHCPOFFER:
@@ -1095,10 +1086,8 @@ static int __init ic_bootp_recv(struct sk_buff *skb, struct net_device *dev, str
/* Let's accept that offer. */
ic_myaddr = b->your_ip;
ic_servaddr = server_id;
-#ifdef IPCONFIG_DEBUG
- printk("DHCP: Offered address %pI4 by server %pI4\n",
- &ic_myaddr, &b->iph.saddr);
-#endif
+ pr_debug("DHCP: Offered address %pI4 by server %pI4\n",
+ &ic_myaddr, &b->iph.saddr);
/* The DHCP indicated server address takes
* precedence over the bootp header one if
* they are different.
@@ -1295,11 +1284,10 @@ static int __init ic_dynamic(void)
return -1;
}
- printk("IP-Config: Got %s answer from %pI4, ",
+ pr_info("IP-Config: Got %s answer from %pI4, my address is %pI4\n",
((ic_got_reply & IC_RARP) ? "RARP"
- : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
- &ic_addrservaddr);
- pr_cont("my address is %pI4\n", &ic_myaddr);
+ : (ic_proto_enabled & IC_USE_DHCP) ? "DHCP" : "BOOTP"),
+ &ic_addrservaddr, &ic_myaddr);
return 0;
}
@@ -1426,7 +1414,7 @@ static int __init ip_auto_config(void)
if (!ic_enable)
return 0;
- DBG(("IP-Config: Entered.\n"));
+ pr_debug("IP-Config: Entered.\n");
#ifdef IPCONFIG_DYNAMIC
try_try_again:
#endif
@@ -1542,7 +1530,7 @@ static int __init ip_auto_config(void)
pr_cont(", mtu=%d", ic_dev_mtu);
for (i = 0; i < CONF_NAMESERVERS_MAX; i++)
if (ic_nameservers[i] != NONE) {
- pr_info(" nameserver%u=%pI4",
+ pr_cont(" nameserver%u=%pI4",
i, &ic_nameservers[i]);
break;
}
@@ -1585,7 +1573,7 @@ static int __init ic_proto_name(char *name)
return 1;
*v = 0;
if (kstrtou8(client_id, 0, dhcp_client_identifier))
- DBG("DHCP: Invalid client identifier type\n");
+ pr_debug("DHCP: Invalid client identifier type\n");
strncpy(dhcp_client_identifier + 1, v + 1, 251);
*v = ',';
}
@@ -1644,7 +1632,7 @@ static int __init ip_auto_config_setup(char *addrs)
if ((cp = strchr(ip, ':')))
*cp++ = '\0';
if (strlen(ip) > 0) {
- DBG(("IP-Config: Parameter #%d: `%s'\n", num, ip));
+ pr_debug("IP-Config: Parameter #%d: `%s'\n", num, ip);
switch (num) {
case 0:
if ((ic_myaddr = in_aton(ip)) == ANY)
@@ -1716,7 +1704,7 @@ static int __init vendor_class_identifier_setup(char *addrs)
if (strlcpy(vendor_class_identifier, addrs,
sizeof(vendor_class_identifier))
>= sizeof(vendor_class_identifier))
- pr_warn("DHCP: vendorclass too long, truncated to \"%s\"",
+ pr_warn("DHCP: vendorclass too long, truncated to \"%s\"\n",
vendor_class_identifier);
return 1;
}
diff --git a/net/ipv4/ipip.c b/net/ipv4/ipip.c
index f34c31defafe..ec51d02166de 100644
--- a/net/ipv4/ipip.c
+++ b/net/ipv4/ipip.c
@@ -103,7 +103,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/netfilter_ipv4.h>
#include <linux/if_ether.h>
@@ -196,7 +195,7 @@ static int ipip_rcv(struct sk_buff *skb)
if (tunnel) {
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- if (iptunnel_pull_header(skb, 0, tpi.proto))
+ if (iptunnel_pull_header(skb, 0, tpi.proto, false))
goto drop;
return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
}
@@ -220,7 +219,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
if (unlikely(skb->protocol != htons(ETH_P_IP)))
goto tx_error;
- skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
if (IS_ERR(skb))
goto out;
@@ -253,9 +252,6 @@ ipip_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
p.i_key = p.o_key = 0;
p.i_flags = p.o_flags = 0;
- if (p.iph.ttl)
- p.iph.frag_off |= htons(IP_DF);
-
err = ip_tunnel_ioctl(dev, &p, cmd);
if (err)
return err;
diff --git a/net/ipv4/ipmr.c b/net/ipv4/ipmr.c
index 92dd4b74d513..395e2814a46d 100644
--- a/net/ipv4/ipmr.c
+++ b/net/ipv4/ipmr.c
@@ -66,28 +66,7 @@
#include <net/netlink.h>
#include <net/fib_rules.h>
#include <linux/netconf.h>
-
-#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
-#define CONFIG_IP_PIMSM 1
-#endif
-
-struct mr_table {
- struct list_head list;
- possible_net_t net;
- u32 id;
- struct sock __rcu *mroute_sk;
- struct timer_list ipmr_expire_timer;
- struct list_head mfc_unres_queue;
- struct list_head mfc_cache_array[MFC_LINES];
- struct vif_device vif_table[MAXVIFS];
- int maxvif;
- atomic_t cache_resolve_queue_len;
- bool mroute_do_assert;
- bool mroute_do_pim;
-#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
- int mroute_reg_vif_num;
-#endif
-};
+#include <net/nexthop.h>
struct ipmr_rule {
struct fib_rule common;
@@ -103,11 +82,7 @@ struct ipmr_result {
static DEFINE_RWLOCK(mrt_lock);
-/*
- * Multicast router control variables
- */
-
-#define VIF_EXISTS(_mrt, _idx) ((_mrt)->vif_table[_idx].dev != NULL)
+/* Multicast router control variables */
/* Special spinlock for queue of unresolved entries */
static DEFINE_SPINLOCK(mfc_unres_lock);
@@ -134,7 +109,7 @@ static int __ipmr_fill_mroute(struct mr_table *mrt, struct sk_buff *skb,
struct mfc_cache *c, struct rtmsg *rtm);
static void mroute_netlink_event(struct mr_table *mrt, struct mfc_cache *mfc,
int cmd);
-static void mroute_clean_tables(struct mr_table *mrt);
+static void mroute_clean_tables(struct mr_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
@@ -252,8 +227,8 @@ static int __net_init ipmr_rules_init(struct net *net)
INIT_LIST_HEAD(&net->ipv4.mr_tables);
mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
- if (!mrt) {
- err = -ENOMEM;
+ if (IS_ERR(mrt)) {
+ err = PTR_ERR(mrt);
goto err1;
}
@@ -301,8 +276,13 @@ static int ipmr_fib_lookup(struct net *net, struct flowi4 *flp4,
static int __net_init ipmr_rules_init(struct net *net)
{
- net->ipv4.mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
- return net->ipv4.mrt ? 0 : -ENOMEM;
+ struct mr_table *mrt;
+
+ mrt = ipmr_new_table(net, RT_TABLE_DEFAULT);
+ if (IS_ERR(mrt))
+ return PTR_ERR(mrt);
+ net->ipv4.mrt = mrt;
+ return 0;
}
static void __net_exit ipmr_rules_exit(struct net *net)
@@ -319,13 +299,17 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
struct mr_table *mrt;
unsigned int i;
+ /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
+ if (id != RT_TABLE_DEFAULT && id >= 1000000000)
+ return ERR_PTR(-EINVAL);
+
mrt = ipmr_get_table(net, id);
if (mrt)
return mrt;
mrt = kzalloc(sizeof(*mrt), GFP_KERNEL);
if (!mrt)
- return NULL;
+ return ERR_PTR(-ENOMEM);
write_pnet(&mrt->net, net);
mrt->id = id;
@@ -338,9 +322,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
setup_timer(&mrt->ipmr_expire_timer, ipmr_expire_process,
(unsigned long)mrt);
-#ifdef CONFIG_IP_PIMSM
mrt->mroute_reg_vif_num = -1;
-#endif
#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
list_add_tail_rcu(&mrt->list, &net->ipv4.mr_tables);
#endif
@@ -350,7 +332,7 @@ static struct mr_table *ipmr_new_table(struct net *net, u32 id)
static void ipmr_free_table(struct mr_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, true);
kfree(mrt);
}
@@ -387,8 +369,24 @@ static void ipmr_del_tunnel(struct net_device *dev, struct vifctl *v)
}
}
-static
-struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
+/* Initialize ipmr pimreg/tunnel in_device */
+static bool ipmr_init_vif_indev(const struct net_device *dev)
+{
+ struct in_device *in_dev;
+
+ ASSERT_RTNL();
+
+ in_dev = __in_dev_get_rtnl(dev);
+ if (!in_dev)
+ return false;
+ ipv4_devconf_setall(in_dev);
+ neigh_parms_data_state_setall(in_dev->arp_parms);
+ IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
+
+ return true;
+}
+
+static struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
{
struct net_device *dev;
@@ -399,7 +397,6 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
int err;
struct ifreq ifr;
struct ip_tunnel_parm p;
- struct in_device *in_dev;
memset(&p, 0, sizeof(p));
p.iph.daddr = v->vifc_rmt_addr.s_addr;
@@ -424,15 +421,8 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
if (err == 0 &&
(dev = __dev_get_by_name(net, p.name)) != NULL) {
dev->flags |= IFF_MULTICAST;
-
- in_dev = __in_dev_get_rtnl(dev);
- if (!in_dev)
+ if (!ipmr_init_vif_indev(dev))
goto failure;
-
- ipv4_devconf_setall(in_dev);
- neigh_parms_data_state_setall(in_dev->arp_parms);
- IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
-
if (dev_open(dev))
goto failure;
dev_hold(dev);
@@ -441,16 +431,11 @@ struct net_device *ipmr_new_tunnel(struct net *net, struct vifctl *v)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
-#ifdef CONFIG_IP_PIMSM
-
+#if defined(CONFIG_IP_PIMSM_V1) || defined(CONFIG_IP_PIMSM_V2)
static netdev_tx_t reg_vif_xmit(struct sk_buff *skb, struct net_device *dev)
{
struct net *net = dev_net(dev);
@@ -500,7 +485,6 @@ static void reg_vif_setup(struct net_device *dev)
static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
{
struct net_device *dev;
- struct in_device *in_dev;
char name[IFNAMSIZ];
if (mrt->id == RT_TABLE_DEFAULT)
@@ -520,18 +504,8 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
return NULL;
}
- rcu_read_lock();
- in_dev = __in_dev_get_rcu(dev);
- if (!in_dev) {
- rcu_read_unlock();
+ if (!ipmr_init_vif_indev(dev))
goto failure;
- }
-
- ipv4_devconf_setall(in_dev);
- neigh_parms_data_state_setall(in_dev->arp_parms);
- IPV4_DEVCONF(in_dev->cnf, RP_FILTER) = 0;
- rcu_read_unlock();
-
if (dev_open(dev))
goto failure;
@@ -540,20 +514,59 @@ static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
+
+/* called with rcu_read_lock() */
+static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
+ unsigned int pimlen)
+{
+ struct net_device *reg_dev = NULL;
+ struct iphdr *encap;
+
+ encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
+ /* Check that:
+ * a. packet is really sent to a multicast group
+ * b. packet is not a NULL-REGISTER
+ * c. packet is not truncated
+ */
+ if (!ipv4_is_multicast(encap->daddr) ||
+ encap->tot_len == 0 ||
+ ntohs(encap->tot_len) + pimlen > skb->len)
+ return 1;
+
+ read_lock(&mrt_lock);
+ if (mrt->mroute_reg_vif_num >= 0)
+ reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
+ read_unlock(&mrt_lock);
+
+ if (!reg_dev)
+ return 1;
+
+ skb->mac_header = skb->network_header;
+ skb_pull(skb, (u8 *)encap - skb->data);
+ skb_reset_network_header(skb);
+ skb->protocol = htons(ETH_P_IP);
+ skb->ip_summed = CHECKSUM_NONE;
+
+ skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
+
+ netif_rx(skb);
+
+ return NET_RX_SUCCESS;
+}
+#else
+static struct net_device *ipmr_reg_vif(struct net *net, struct mr_table *mrt)
+{
+ return NULL;
+}
#endif
/**
* vif_delete - Delete a VIF entry
* @notify: Set to 1, if the caller is a notifier_call
*/
-
static int vif_delete(struct mr_table *mrt, int vifi, int notify,
struct list_head *head)
{
@@ -575,10 +588,8 @@ static int vif_delete(struct mr_table *mrt, int vifi, int notify,
return -EADDRNOTAVAIL;
}
-#ifdef CONFIG_IP_PIMSM
if (vifi == mrt->mroute_reg_vif_num)
mrt->mroute_reg_vif_num = -1;
-#endif
if (vifi + 1 == mrt->maxvif) {
int tmp;
@@ -625,7 +636,6 @@ static inline void ipmr_cache_free(struct mfc_cache *c)
/* Destroy an unresolved cache entry, killing queued skbs
* and reporting error to netlink readers.
*/
-
static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
{
struct net *net = read_pnet(&mrt->net);
@@ -653,9 +663,7 @@ static void ipmr_destroy_unres(struct mr_table *mrt, struct mfc_cache *c)
ipmr_cache_free(c);
}
-
/* Timer process for the unresolved queue. */
-
static void ipmr_expire_process(unsigned long arg)
{
struct mr_table *mrt = (struct mr_table *)arg;
@@ -695,7 +703,6 @@ out:
}
/* Fill oifs list. It is called under write locked mrt_lock. */
-
static void ipmr_update_thresholds(struct mr_table *mrt, struct mfc_cache *cache,
unsigned char *ttls)
{
@@ -731,10 +738,10 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return -EADDRINUSE;
switch (vifc->vifc_flags) {
-#ifdef CONFIG_IP_PIMSM
case VIFF_REGISTER:
- /*
- * Special Purpose VIF in PIM
+ if (!ipmr_pimsm_enabled())
+ return -EINVAL;
+ /* Special Purpose VIF in PIM
* All the packets will be sent to the daemon
*/
if (mrt->mroute_reg_vif_num >= 0)
@@ -749,7 +756,6 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return err;
}
break;
-#endif
case VIFF_TUNNEL:
dev = ipmr_new_tunnel(net, vifc);
if (!dev)
@@ -761,7 +767,6 @@ static int vif_add(struct net *net, struct mr_table *mrt,
return err;
}
break;
-
case VIFF_USE_IFINDEX:
case 0:
if (vifc->vifc_flags == VIFF_USE_IFINDEX) {
@@ -815,10 +820,8 @@ static int vif_add(struct net *net, struct mr_table *mrt,
/* And finish update writing critical data */
write_lock_bh(&mrt_lock);
v->dev = dev;
-#ifdef CONFIG_IP_PIMSM
if (v->flags & VIFF_REGISTER)
mrt->mroute_reg_vif_num = vifi;
-#endif
if (vifi+1 > mrt->maxvif)
mrt->maxvif = vifi+1;
write_unlock_bh(&mrt_lock);
@@ -883,9 +886,7 @@ skip:
return ipmr_cache_find_any_parent(mrt, vifi);
}
-/*
- * Allocate a multicast cache entry
- */
+/* Allocate a multicast cache entry */
static struct mfc_cache *ipmr_cache_alloc(void)
{
struct mfc_cache *c = kmem_cache_zalloc(mrt_cachep, GFP_KERNEL);
@@ -906,10 +907,7 @@ static struct mfc_cache *ipmr_cache_alloc_unres(void)
return c;
}
-/*
- * A cache entry has gone into a resolved state from queued
- */
-
+/* A cache entry has gone into a resolved state from queued */
static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
struct mfc_cache *uc, struct mfc_cache *c)
{
@@ -917,7 +915,6 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
struct nlmsgerr *e;
/* Play the pending entries through our router */
-
while ((skb = __skb_dequeue(&uc->mfc_un.unres.unresolved))) {
if (ip_hdr(skb)->version == 0) {
struct nlmsghdr *nlh = (struct nlmsghdr *)skb_pull(skb, sizeof(struct iphdr));
@@ -941,34 +938,29 @@ static void ipmr_cache_resolve(struct net *net, struct mr_table *mrt,
}
}
-/*
- * Bounce a cache query up to mrouted. We could use netlink for this but mrouted
- * expects the following bizarre scheme.
+/* Bounce a cache query up to mrouted. We could use netlink for this but mrouted
+ * expects the following bizarre scheme.
*
- * Called under mrt_lock.
+ * Called under mrt_lock.
*/
-
static int ipmr_cache_report(struct mr_table *mrt,
struct sk_buff *pkt, vifi_t vifi, int assert)
{
- struct sk_buff *skb;
const int ihl = ip_hdrlen(pkt);
+ struct sock *mroute_sk;
struct igmphdr *igmp;
struct igmpmsg *msg;
- struct sock *mroute_sk;
+ struct sk_buff *skb;
int ret;
-#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT)
skb = skb_realloc_headroom(pkt, sizeof(struct iphdr));
else
-#endif
skb = alloc_skb(128, GFP_ATOMIC);
if (!skb)
return -ENOBUFS;
-#ifdef CONFIG_IP_PIMSM
if (assert == IGMPMSG_WHOLEPKT) {
/* Ugly, but we have no choice with this interface.
* Duplicate old header, fix ihl, length etc.
@@ -986,28 +978,23 @@ static int ipmr_cache_report(struct mr_table *mrt,
ip_hdr(skb)->ihl = sizeof(struct iphdr) >> 2;
ip_hdr(skb)->tot_len = htons(ntohs(ip_hdr(pkt)->tot_len) +
sizeof(struct iphdr));
- } else
-#endif
- {
-
- /* Copy the IP header */
-
- skb_set_network_header(skb, skb->len);
- skb_put(skb, ihl);
- skb_copy_to_linear_data(skb, pkt->data, ihl);
- ip_hdr(skb)->protocol = 0; /* Flag to the kernel this is a route add */
- msg = (struct igmpmsg *)skb_network_header(skb);
- msg->im_vif = vifi;
- skb_dst_set(skb, dst_clone(skb_dst(pkt)));
-
- /* Add our header */
-
- igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
- igmp->type =
- msg->im_msgtype = assert;
- igmp->code = 0;
- ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
- skb->transport_header = skb->network_header;
+ } else {
+ /* Copy the IP header */
+ skb_set_network_header(skb, skb->len);
+ skb_put(skb, ihl);
+ skb_copy_to_linear_data(skb, pkt->data, ihl);
+ /* Flag to the kernel this is a route add */
+ ip_hdr(skb)->protocol = 0;
+ msg = (struct igmpmsg *)skb_network_header(skb);
+ msg->im_vif = vifi;
+ skb_dst_set(skb, dst_clone(skb_dst(pkt)));
+ /* Add our header */
+ igmp = (struct igmphdr *)skb_put(skb, sizeof(struct igmphdr));
+ igmp->type = assert;
+ msg->im_msgtype = assert;
+ igmp->code = 0;
+ ip_hdr(skb)->tot_len = htons(skb->len); /* Fix the length */
+ skb->transport_header = skb->network_header;
}
rcu_read_lock();
@@ -1019,7 +1006,6 @@ static int ipmr_cache_report(struct mr_table *mrt,
}
/* Deliver to mrouted */
-
ret = sock_queue_rcv_skb(mroute_sk, skb);
rcu_read_unlock();
if (ret < 0) {
@@ -1030,12 +1016,9 @@ static int ipmr_cache_report(struct mr_table *mrt,
return ret;
}
-/*
- * Queue a packet for resolution. It gets locked cache entry!
- */
-
-static int
-ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
+/* Queue a packet for resolution. It gets locked cache entry! */
+static int ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi,
+ struct sk_buff *skb)
{
bool found = false;
int err;
@@ -1053,7 +1036,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
if (!found) {
/* Create a new entry if allowable */
-
if (atomic_read(&mrt->cache_resolve_queue_len) >= 10 ||
(c = ipmr_cache_alloc_unres()) == NULL) {
spin_unlock_bh(&mfc_unres_lock);
@@ -1063,13 +1045,11 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
}
/* Fill in the new cache entry */
-
c->mfc_parent = -1;
c->mfc_origin = iph->saddr;
c->mfc_mcastgrp = iph->daddr;
/* Reflect first query at mrouted. */
-
err = ipmr_cache_report(mrt, skb, vifi, IGMPMSG_NOCACHE);
if (err < 0) {
/* If the report failed throw the cache entry
@@ -1091,7 +1071,6 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
}
/* See if we can append the packet */
-
if (c->mfc_un.unres.unresolved.qlen > 3) {
kfree_skb(skb);
err = -ENOBUFS;
@@ -1104,9 +1083,7 @@ ipmr_cache_unresolved(struct mr_table *mrt, vifi_t vifi, struct sk_buff *skb)
return err;
}
-/*
- * MFC cache manipulation by user space mroute daemon
- */
+/* MFC cache manipulation by user space mroute daemon */
static int ipmr_mfc_delete(struct mr_table *mrt, struct mfcctl *mfc, int parent)
{
@@ -1177,9 +1154,8 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
list_add_rcu(&c->list, &mrt->mfc_cache_array[line]);
- /*
- * Check to see if we resolved a queued list. If so we
- * need to send on the frames and tidy up.
+ /* Check to see if we resolved a queued list. If so we
+ * need to send on the frames and tidy up.
*/
found = false;
spin_lock_bh(&mfc_unres_lock);
@@ -1204,29 +1180,25 @@ static int ipmr_mfc_add(struct net *net, struct mr_table *mrt,
return 0;
}
-/*
- * Close the multicast socket, and clear the vif tables etc
- */
-
-static void mroute_clean_tables(struct mr_table *mrt)
+/* Close the multicast socket, and clear the vif tables etc */
+static void mroute_clean_tables(struct mr_table *mrt, bool all)
{
int i;
LIST_HEAD(list);
struct mfc_cache *c, *next;
/* Shut down all active vif entries */
-
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif_table[i].flags & VIFF_STATIC))
- vif_delete(mrt, i, 0, &list);
+ if (!all && (mrt->vif_table[i].flags & VIFF_STATIC))
+ continue;
+ vif_delete(mrt, i, 0, &list);
}
unregister_netdevice_many(&list);
/* Wipe the cache */
-
for (i = 0; i < MFC_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc_cache_array[i], list) {
- if (c->mfc_flags & MFC_STATIC)
+ if (!all && (c->mfc_flags & MFC_STATIC))
continue;
list_del_rcu(&c->list);
mroute_netlink_event(mrt, c, RTM_DELROUTE);
@@ -1261,50 +1233,58 @@ static void mrtsock_destruct(struct sock *sk)
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
RCU_INIT_POINTER(mrt->mroute_sk, NULL);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, false);
}
}
rtnl_unlock();
}
-/*
- * Socket options and virtual interface manipulation. The whole
- * virtual interface system is a complete heap, but unfortunately
- * that's how BSD mrouted happens to think. Maybe one day with a proper
- * MOSPF/PIM router set up we can clean this up.
+/* Socket options and virtual interface manipulation. The whole
+ * virtual interface system is a complete heap, but unfortunately
+ * that's how BSD mrouted happens to think. Maybe one day with a proper
+ * MOSPF/PIM router set up we can clean this up.
*/
-int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsigned int optlen)
+int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval,
+ unsigned int optlen)
{
- int ret, parent = 0;
- struct vifctl vif;
- struct mfcctl mfc;
struct net *net = sock_net(sk);
+ int val, ret = 0, parent = 0;
struct mr_table *mrt;
+ struct vifctl vif;
+ struct mfcctl mfc;
+ u32 uval;
+ /* There's one exception to the lock - MRT_DONE which needs to unlock */
+ rtnl_lock();
if (sk->sk_type != SOCK_RAW ||
- inet_sk(sk)->inet_num != IPPROTO_IGMP)
- return -EOPNOTSUPP;
+ inet_sk(sk)->inet_num != IPPROTO_IGMP) {
+ ret = -EOPNOTSUPP;
+ goto out_unlock;
+ }
mrt = ipmr_get_table(net, raw_sk(sk)->ipmr_table ? : RT_TABLE_DEFAULT);
- if (!mrt)
- return -ENOENT;
-
+ if (!mrt) {
+ ret = -ENOENT;
+ goto out_unlock;
+ }
if (optname != MRT_INIT) {
if (sk != rcu_access_pointer(mrt->mroute_sk) &&
- !ns_capable(net->user_ns, CAP_NET_ADMIN))
- return -EACCES;
+ !ns_capable(net->user_ns, CAP_NET_ADMIN)) {
+ ret = -EACCES;
+ goto out_unlock;
+ }
}
switch (optname) {
case MRT_INIT:
- if (optlen != sizeof(int))
- return -EINVAL;
-
- rtnl_lock();
+ if (optlen != sizeof(int)) {
+ ret = -EINVAL;
+ break;
+ }
if (rtnl_dereference(mrt->mroute_sk)) {
- rtnl_unlock();
- return -EADDRINUSE;
+ ret = -EADDRINUSE;
+ break;
}
ret = ip_ra_control(sk, 1, mrtsock_destruct);
@@ -1315,129 +1295,133 @@ int ip_mroute_setsockopt(struct sock *sk, int optname, char __user *optval, unsi
NETCONFA_IFINDEX_ALL,
net->ipv4.devconf_all);
}
- rtnl_unlock();
- return ret;
+ break;
case MRT_DONE:
- if (sk != rcu_access_pointer(mrt->mroute_sk))
- return -EACCES;
- return ip_ra_control(sk, 0, NULL);
+ if (sk != rcu_access_pointer(mrt->mroute_sk)) {
+ ret = -EACCES;
+ } else {
+ /* We need to unlock here because mrtsock_destruct takes
+ * care of rtnl itself and we can't change that due to
+ * the IP_ROUTER_ALERT setsockopt which runs without it.
+ */
+ rtnl_unlock();
+ ret = ip_ra_control(sk, 0, NULL);
+ goto out;
+ }
+ break;
case MRT_ADD_VIF:
case MRT_DEL_VIF:
- if (optlen != sizeof(vif))
- return -EINVAL;
- if (copy_from_user(&vif, optval, sizeof(vif)))
- return -EFAULT;
- if (vif.vifc_vifi >= MAXVIFS)
- return -ENFILE;
- rtnl_lock();
+ if (optlen != sizeof(vif)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&vif, optval, sizeof(vif))) {
+ ret = -EFAULT;
+ break;
+ }
+ if (vif.vifc_vifi >= MAXVIFS) {
+ ret = -ENFILE;
+ break;
+ }
if (optname == MRT_ADD_VIF) {
ret = vif_add(net, mrt, &vif,
sk == rtnl_dereference(mrt->mroute_sk));
} else {
ret = vif_delete(mrt, vif.vifc_vifi, 0, NULL);
}
- rtnl_unlock();
- return ret;
-
- /*
- * Manipulate the forwarding caches. These live
- * in a sort of kernel/user symbiosis.
- */
+ break;
+ /* Manipulate the forwarding caches. These live
+ * in a sort of kernel/user symbiosis.
+ */
case MRT_ADD_MFC:
case MRT_DEL_MFC:
parent = -1;
case MRT_ADD_MFC_PROXY:
case MRT_DEL_MFC_PROXY:
- if (optlen != sizeof(mfc))
- return -EINVAL;
- if (copy_from_user(&mfc, optval, sizeof(mfc)))
- return -EFAULT;
+ if (optlen != sizeof(mfc)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (copy_from_user(&mfc, optval, sizeof(mfc))) {
+ ret = -EFAULT;
+ break;
+ }
if (parent == 0)
parent = mfc.mfcc_parent;
- rtnl_lock();
if (optname == MRT_DEL_MFC || optname == MRT_DEL_MFC_PROXY)
ret = ipmr_mfc_delete(mrt, &mfc, parent);
else
ret = ipmr_mfc_add(net, mrt, &mfc,
sk == rtnl_dereference(mrt->mroute_sk),
parent);
- rtnl_unlock();
- return ret;
- /*
- * Control PIM assert.
- */
+ break;
+ /* Control PIM assert. */
case MRT_ASSERT:
- {
- int v;
- if (optlen != sizeof(v))
- return -EINVAL;
- if (get_user(v, (int __user *)optval))
- return -EFAULT;
- mrt->mroute_do_assert = v;
- return 0;
- }
-#ifdef CONFIG_IP_PIMSM
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (get_user(val, (int __user *)optval)) {
+ ret = -EFAULT;
+ break;
+ }
+ mrt->mroute_do_assert = val;
+ break;
case MRT_PIM:
- {
- int v;
-
- if (optlen != sizeof(v))
- return -EINVAL;
- if (get_user(v, (int __user *)optval))
- return -EFAULT;
- v = !!v;
+ if (!ipmr_pimsm_enabled()) {
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ if (optlen != sizeof(val)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (get_user(val, (int __user *)optval)) {
+ ret = -EFAULT;
+ break;
+ }
- rtnl_lock();
- ret = 0;
- if (v != mrt->mroute_do_pim) {
- mrt->mroute_do_pim = v;
- mrt->mroute_do_assert = v;
+ val = !!val;
+ if (val != mrt->mroute_do_pim) {
+ mrt->mroute_do_pim = val;
+ mrt->mroute_do_assert = val;
}
- rtnl_unlock();
- return ret;
- }
-#endif
-#ifdef CONFIG_IP_MROUTE_MULTIPLE_TABLES
+ break;
case MRT_TABLE:
- {
- u32 v;
-
- if (optlen != sizeof(u32))
- return -EINVAL;
- if (get_user(v, (u32 __user *)optval))
- return -EFAULT;
-
- /* "pimreg%u" should not exceed 16 bytes (IFNAMSIZ) */
- if (v != RT_TABLE_DEFAULT && v >= 1000000000)
- return -EINVAL;
+ if (!IS_BUILTIN(CONFIG_IP_MROUTE_MULTIPLE_TABLES)) {
+ ret = -ENOPROTOOPT;
+ break;
+ }
+ if (optlen != sizeof(uval)) {
+ ret = -EINVAL;
+ break;
+ }
+ if (get_user(uval, (u32 __user *)optval)) {
+ ret = -EFAULT;
+ break;
+ }
- rtnl_lock();
- ret = 0;
if (sk == rtnl_dereference(mrt->mroute_sk)) {
ret = -EBUSY;
} else {
- if (!ipmr_new_table(net, v))
- ret = -ENOMEM;
+ mrt = ipmr_new_table(net, uval);
+ if (IS_ERR(mrt))
+ ret = PTR_ERR(mrt);
else
- raw_sk(sk)->ipmr_table = v;
+ raw_sk(sk)->ipmr_table = uval;
}
- rtnl_unlock();
- return ret;
- }
-#endif
- /*
- * Spurious command, or MRT_VERSION which you cannot
- * set.
- */
+ break;
+ /* Spurious command, or MRT_VERSION which you cannot set. */
default:
- return -ENOPROTOOPT;
+ ret = -ENOPROTOOPT;
}
+out_unlock:
+ rtnl_unlock();
+out:
+ return ret;
}
-/*
- * Getsock opt support for the multicast routing system.
- */
-
+/* Getsock opt support for the multicast routing system. */
int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int __user *optlen)
{
int olr;
@@ -1453,39 +1437,35 @@ int ip_mroute_getsockopt(struct sock *sk, int optname, char __user *optval, int
if (!mrt)
return -ENOENT;
- if (optname != MRT_VERSION &&
-#ifdef CONFIG_IP_PIMSM
- optname != MRT_PIM &&
-#endif
- optname != MRT_ASSERT)
+ switch (optname) {
+ case MRT_VERSION:
+ val = 0x0305;
+ break;
+ case MRT_PIM:
+ if (!ipmr_pimsm_enabled())
+ return -ENOPROTOOPT;
+ val = mrt->mroute_do_pim;
+ break;
+ case MRT_ASSERT:
+ val = mrt->mroute_do_assert;
+ break;
+ default:
return -ENOPROTOOPT;
+ }
if (get_user(olr, optlen))
return -EFAULT;
-
olr = min_t(unsigned int, olr, sizeof(int));
if (olr < 0)
return -EINVAL;
-
if (put_user(olr, optlen))
return -EFAULT;
- if (optname == MRT_VERSION)
- val = 0x0305;
-#ifdef CONFIG_IP_PIMSM
- else if (optname == MRT_PIM)
- val = mrt->mroute_do_pim;
-#endif
- else
- val = mrt->mroute_do_assert;
if (copy_to_user(optval, &val, olr))
return -EFAULT;
return 0;
}
-/*
- * The IP multicast ioctl support routines.
- */
-
+/* The IP multicast ioctl support routines. */
int ipmr_ioctl(struct sock *sk, int cmd, void __user *arg)
{
struct sioc_sg_req sr;
@@ -1618,7 +1598,6 @@ int ipmr_compat_ioctl(struct sock *sk, unsigned int cmd, void __user *arg)
}
#endif
-
static int ipmr_device_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
@@ -1640,17 +1619,14 @@ static int ipmr_device_event(struct notifier_block *this, unsigned long event, v
return NOTIFY_DONE;
}
-
static struct notifier_block ip_mr_notifier = {
.notifier_call = ipmr_device_event,
};
-/*
- * Encapsulate a packet by attaching a valid IPIP header to it.
- * This avoids tunnel drivers and other mess and gives us the speed so
- * important for multicast video.
+/* Encapsulate a packet by attaching a valid IPIP header to it.
+ * This avoids tunnel drivers and other mess and gives us the speed so
+ * important for multicast video.
*/
-
static void ip_encap(struct net *net, struct sk_buff *skb,
__be32 saddr, __be32 daddr)
{
@@ -1692,9 +1668,7 @@ static inline int ipmr_forward_finish(struct net *net, struct sock *sk,
return dst_output(net, sk, skb);
}
-/*
- * Processing handlers for ipmr_forward
- */
+/* Processing handlers for ipmr_forward */
static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, struct mfc_cache *c, int vifi)
@@ -1709,7 +1683,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
if (!vif->dev)
goto out_free;
-#ifdef CONFIG_IP_PIMSM
if (vif->flags & VIFF_REGISTER) {
vif->pkt_out++;
vif->bytes_out += skb->len;
@@ -1718,7 +1691,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
ipmr_cache_report(mrt, skb, vifi, IGMPMSG_WHOLEPKT);
goto out_free;
}
-#endif
if (vif->flags & VIFF_TUNNEL) {
rt = ip_route_output_ports(net, &fl4, NULL,
@@ -1745,7 +1717,6 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
* allow to send ICMP, so that packets will disappear
* to blackhole.
*/
-
IP_INC_STATS(net, IPSTATS_MIB_FRAGFAILS);
ip_rt_put(rt);
goto out_free;
@@ -1777,8 +1748,7 @@ static void ipmr_queue_xmit(struct net *net, struct mr_table *mrt,
IPCB(skb)->flags |= IPSKB_FORWARDED;
- /*
- * RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
+ /* RFC1584 teaches, that DVMRP/PIM router must deliver packets locally
* not only before forwarding, but after forwarding on all output
* interfaces. It is clear, if mrouter runs a multicasting
* program, it should receive packets not depending to what interface
@@ -1809,7 +1779,6 @@ static int ipmr_find_vif(struct mr_table *mrt, struct net_device *dev)
}
/* "local" means that we should preserve one skb (for local delivery) */
-
static void ip_mr_forward(struct net *net, struct mr_table *mrt,
struct sk_buff *skb, struct mfc_cache *cache,
int local)
@@ -1834,9 +1803,7 @@ static void ip_mr_forward(struct net *net, struct mr_table *mrt,
goto forward;
}
- /*
- * Wrong interface: drop packet and (maybe) send PIM assert.
- */
+ /* Wrong interface: drop packet and (maybe) send PIM assert. */
if (mrt->vif_table[vif].dev != skb->dev) {
if (rt_is_output_route(skb_rtable(skb))) {
/* It is our own packet, looped back.
@@ -1875,9 +1842,7 @@ forward:
mrt->vif_table[vif].pkt_in++;
mrt->vif_table[vif].bytes_in += skb->len;
- /*
- * Forward the frame
- */
+ /* Forward the frame */
if (cache->mfc_origin == htonl(INADDR_ANY) &&
cache->mfc_mcastgrp == htonl(INADDR_ANY)) {
if (true_vifi >= 0 &&
@@ -1951,11 +1916,9 @@ static struct mr_table *ipmr_rt_fib_lookup(struct net *net, struct sk_buff *skb)
return mrt;
}
-/*
- * Multicast packets for forwarding arrive here
- * Called with rcu_read_lock();
+/* Multicast packets for forwarding arrive here
+ * Called with rcu_read_lock();
*/
-
int ip_mr_input(struct sk_buff *skb)
{
struct mfc_cache *cache;
@@ -2006,9 +1969,7 @@ int ip_mr_input(struct sk_buff *skb)
vif);
}
- /*
- * No usable cache entry
- */
+ /* No usable cache entry */
if (!cache) {
int vif;
@@ -2049,53 +2010,8 @@ dont_forward:
return 0;
}
-#ifdef CONFIG_IP_PIMSM
-/* called with rcu_read_lock() */
-static int __pim_rcv(struct mr_table *mrt, struct sk_buff *skb,
- unsigned int pimlen)
-{
- struct net_device *reg_dev = NULL;
- struct iphdr *encap;
-
- encap = (struct iphdr *)(skb_transport_header(skb) + pimlen);
- /*
- * Check that:
- * a. packet is really sent to a multicast group
- * b. packet is not a NULL-REGISTER
- * c. packet is not truncated
- */
- if (!ipv4_is_multicast(encap->daddr) ||
- encap->tot_len == 0 ||
- ntohs(encap->tot_len) + pimlen > skb->len)
- return 1;
-
- read_lock(&mrt_lock);
- if (mrt->mroute_reg_vif_num >= 0)
- reg_dev = mrt->vif_table[mrt->mroute_reg_vif_num].dev;
- read_unlock(&mrt_lock);
-
- if (!reg_dev)
- return 1;
-
- skb->mac_header = skb->network_header;
- skb_pull(skb, (u8 *)encap - skb->data);
- skb_reset_network_header(skb);
- skb->protocol = htons(ETH_P_IP);
- skb->ip_summed = CHECKSUM_NONE;
-
- skb_tunnel_rx(skb, reg_dev, dev_net(reg_dev));
-
- netif_rx(skb);
-
- return NET_RX_SUCCESS;
-}
-#endif
-
#ifdef CONFIG_IP_PIMSM_V1
-/*
- * Handle IGMP messages of PIMv1
- */
-
+/* Handle IGMP messages of PIMv1 */
int pim_rcv_v1(struct sk_buff *skb)
{
struct igmphdr *pim;
@@ -2256,8 +2172,6 @@ int ipmr_get_route(struct net *net, struct sk_buff *skb,
}
read_lock(&mrt_lock);
- if (!nowait && (rtm->rtm_flags & RTM_F_NOTIFY))
- cache->mfc_flags |= MFC_NOTIFY;
err = __ipmr_fill_mroute(mrt, skb, cache, rtm);
read_unlock(&mrt_lock);
rcu_read_unlock();
@@ -2419,10 +2333,133 @@ done:
return skb->len;
}
+static const struct nla_policy rtm_ipmr_policy[RTA_MAX + 1] = {
+ [RTA_SRC] = { .type = NLA_U32 },
+ [RTA_DST] = { .type = NLA_U32 },
+ [RTA_IIF] = { .type = NLA_U32 },
+ [RTA_TABLE] = { .type = NLA_U32 },
+ [RTA_MULTIPATH] = { .len = sizeof(struct rtnexthop) },
+};
+
+static bool ipmr_rtm_validate_proto(unsigned char rtm_protocol)
+{
+ switch (rtm_protocol) {
+ case RTPROT_STATIC:
+ case RTPROT_MROUTED:
+ return true;
+ }
+ return false;
+}
+
+static int ipmr_nla_get_ttls(const struct nlattr *nla, struct mfcctl *mfcc)
+{
+ struct rtnexthop *rtnh = nla_data(nla);
+ int remaining = nla_len(nla), vifi = 0;
+
+ while (rtnh_ok(rtnh, remaining)) {
+ mfcc->mfcc_ttls[vifi] = rtnh->rtnh_hops;
+ if (++vifi == MAXVIFS)
+ break;
+ rtnh = rtnh_next(rtnh, &remaining);
+ }
+
+ return remaining > 0 ? -EINVAL : vifi;
+}
+
+/* returns < 0 on error, 0 for ADD_MFC and 1 for ADD_MFC_PROXY */
+static int rtm_to_ipmr_mfcc(struct net *net, struct nlmsghdr *nlh,
+ struct mfcctl *mfcc, int *mrtsock,
+ struct mr_table **mrtret)
+{
+ struct net_device *dev = NULL;
+ u32 tblid = RT_TABLE_DEFAULT;
+ struct mr_table *mrt;
+ struct nlattr *attr;
+ struct rtmsg *rtm;
+ int ret, rem;
+
+ ret = nlmsg_validate(nlh, sizeof(*rtm), RTA_MAX, rtm_ipmr_policy);
+ if (ret < 0)
+ goto out;
+ rtm = nlmsg_data(nlh);
+
+ ret = -EINVAL;
+ if (rtm->rtm_family != RTNL_FAMILY_IPMR || rtm->rtm_dst_len != 32 ||
+ rtm->rtm_type != RTN_MULTICAST ||
+ rtm->rtm_scope != RT_SCOPE_UNIVERSE ||
+ !ipmr_rtm_validate_proto(rtm->rtm_protocol))
+ goto out;
+
+ memset(mfcc, 0, sizeof(*mfcc));
+ mfcc->mfcc_parent = -1;
+ ret = 0;
+ nlmsg_for_each_attr(attr, nlh, sizeof(struct rtmsg), rem) {
+ switch (nla_type(attr)) {
+ case RTA_SRC:
+ mfcc->mfcc_origin.s_addr = nla_get_be32(attr);
+ break;
+ case RTA_DST:
+ mfcc->mfcc_mcastgrp.s_addr = nla_get_be32(attr);
+ break;
+ case RTA_IIF:
+ dev = __dev_get_by_index(net, nla_get_u32(attr));
+ if (!dev) {
+ ret = -ENODEV;
+ goto out;
+ }
+ break;
+ case RTA_MULTIPATH:
+ if (ipmr_nla_get_ttls(attr, mfcc) < 0) {
+ ret = -EINVAL;
+ goto out;
+ }
+ break;
+ case RTA_PREFSRC:
+ ret = 1;
+ break;
+ case RTA_TABLE:
+ tblid = nla_get_u32(attr);
+ break;
+ }
+ }
+ mrt = ipmr_get_table(net, tblid);
+ if (!mrt) {
+ ret = -ENOENT;
+ goto out;
+ }
+ *mrtret = mrt;
+ *mrtsock = rtm->rtm_protocol == RTPROT_MROUTED ? 1 : 0;
+ if (dev)
+ mfcc->mfcc_parent = ipmr_find_vif(mrt, dev);
+
+out:
+ return ret;
+}
+
+/* takes care of both newroute and delroute */
+static int ipmr_rtm_route(struct sk_buff *skb, struct nlmsghdr *nlh)
+{
+ struct net *net = sock_net(skb->sk);
+ int ret, mrtsock, parent;
+ struct mr_table *tbl;
+ struct mfcctl mfcc;
+
+ mrtsock = 0;
+ tbl = NULL;
+ ret = rtm_to_ipmr_mfcc(net, nlh, &mfcc, &mrtsock, &tbl);
+ if (ret < 0)
+ return ret;
+
+ parent = ret ? mfcc.mfcc_parent : -1;
+ if (nlh->nlmsg_type == RTM_NEWROUTE)
+ return ipmr_mfc_add(net, tbl, &mfcc, mrtsock, parent);
+ else
+ return ipmr_mfc_delete(tbl, &mfcc, parent);
+}
+
#ifdef CONFIG_PROC_FS
-/*
- * The /proc interfaces to multicast routing :
- * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
+/* The /proc interfaces to multicast routing :
+ * /proc/net/ip_mr_cache & /proc/net/ip_mr_vif
*/
struct ipmr_vif_iter {
struct seq_net_private p;
@@ -2706,10 +2743,7 @@ static const struct net_protocol pim_protocol = {
};
#endif
-
-/*
- * Setup for IP multicast routing
- */
+/* Setup for IP multicast routing */
static int __net_init ipmr_net_init(struct net *net)
{
int err;
@@ -2759,8 +2793,6 @@ int __init ip_mr_init(void)
sizeof(struct mfc_cache),
0, SLAB_HWCACHE_ALIGN | SLAB_PANIC,
NULL);
- if (!mrt_cachep)
- return -ENOMEM;
err = register_pernet_subsys(&ipmr_net_ops);
if (err)
@@ -2778,6 +2810,10 @@ int __init ip_mr_init(void)
#endif
rtnl_register(RTNL_FAMILY_IPMR, RTM_GETROUTE,
NULL, ipmr_rtm_dumproute, NULL);
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_NEWROUTE,
+ ipmr_rtm_route, NULL, NULL);
+ rtnl_register(RTNL_FAMILY_IPMR, RTM_DELROUTE,
+ ipmr_rtm_route, NULL, NULL);
return 0;
#ifdef CONFIG_IP_PIMSM_V2
diff --git a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig
index a35584176535..c187c60e3e0c 100644
--- a/net/ipv4/netfilter/Kconfig
+++ b/net/ipv4/netfilter/Kconfig
@@ -60,6 +60,7 @@ config NFT_REJECT_IPV4
config NFT_DUP_IPV4
tristate "IPv4 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DUP_IPV4
help
This module enables IPv4 packet duplication support for nf_tables.
diff --git a/net/ipv4/netfilter/arp_tables.c b/net/ipv4/netfilter/arp_tables.c
index 11dccba474b7..4133b0f513af 100644
--- a/net/ipv4/netfilter/arp_tables.c
+++ b/net/ipv4/netfilter/arp_tables.c
@@ -38,13 +38,13 @@ MODULE_DESCRIPTION("arptables core");
/*#define DEBUG_ARP_TABLES_USER*/
#ifdef DEBUG_ARP_TABLES
-#define dprintf(format, args...) printk(format , ## args)
+#define dprintf(format, args...) pr_debug(format, ## args)
#else
#define dprintf(format, args...)
#endif
#ifdef DEBUG_ARP_TABLES_USER
-#define duprintf(format, args...) printk(format , ## args)
+#define duprintf(format, args...) pr_debug(format, ## args)
#else
#define duprintf(format, args...)
#endif
@@ -359,11 +359,12 @@ unsigned int arpt_do_table(struct sk_buff *skb,
}
/* All zeroes == unconditional rule. */
-static inline bool unconditional(const struct arpt_arp *arp)
+static inline bool unconditional(const struct arpt_entry *e)
{
static const struct arpt_arp uncond;
- return memcmp(arp, &uncond, sizeof(uncond)) == 0;
+ return e->target_offset == sizeof(struct arpt_entry) &&
+ memcmp(&e->arp, &uncond, sizeof(uncond)) == 0;
}
/* Figures out from what hook each rule can be called: returns 0 if
@@ -402,11 +403,10 @@ static int mark_source_chains(const struct xt_table_info *newinfo,
|= ((1 << hook) | (1 << NF_ARP_NUMHOOKS));
/* Unconditional return/END. */
- if ((e->target_offset == sizeof(struct arpt_entry) &&
+ if ((unconditional(e) &&
(strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < 0 && unconditional(&e->arp)) ||
- visited) {
+ t->verdict < 0) || visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
@@ -474,14 +474,12 @@ next:
return 1;
}
-static inline int check_entry(const struct arpt_entry *e, const char *name)
+static inline int check_entry(const struct arpt_entry *e)
{
const struct xt_entry_target *t;
- if (!arp_checkentry(&e->arp)) {
- duprintf("arp_tables: arp check failed %p %s.\n", e, name);
+ if (!arp_checkentry(&e->arp))
return -EINVAL;
- }
if (e->target_offset + sizeof(struct xt_entry_target) > e->next_offset)
return -EINVAL;
@@ -522,10 +520,6 @@ find_check_entry(struct arpt_entry *e, const char *name, unsigned int size)
struct xt_target *target;
int ret;
- ret = check_entry(e, name);
- if (ret)
- return ret;
-
e->counters.pcnt = xt_percpu_counter_alloc();
if (IS_ERR_VALUE(e->counters.pcnt))
return -ENOMEM;
@@ -557,7 +551,7 @@ static bool check_underflow(const struct arpt_entry *e)
const struct xt_entry_target *t;
unsigned int verdict;
- if (!unconditional(&e->arp))
+ if (!unconditional(e))
return false;
t = arpt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
@@ -576,9 +570,11 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
unsigned int valid_hooks)
{
unsigned int h;
+ int err;
if ((unsigned long)e % __alignof__(struct arpt_entry) != 0 ||
- (unsigned char *)e + sizeof(struct arpt_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct arpt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p\n", e);
return -EINVAL;
}
@@ -590,6 +586,10 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
return -EINVAL;
}
+ err = check_entry(e);
+ if (err)
+ return err;
+
/* Check hooks & underflows */
for (h = 0; h < NF_ARP_NUMHOOKS; h++) {
if (!(valid_hooks & (1 << h)))
@@ -598,9 +598,9 @@ static inline int check_entry_size_and_hooks(struct arpt_entry *e,
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
if (!check_underflow(e)) {
- pr_err("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ pr_debug("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
return -EINVAL;
}
newinfo->underflow[h] = underflows[h];
@@ -969,6 +969,7 @@ static int get_entries(struct net *net, struct arpt_get_entries __user *uptr,
sizeof(struct arpt_get_entries) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
if (!IS_ERR_OR_NULL(t)) {
@@ -1233,7 +1234,8 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_arpt_entry) != 0 ||
- (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct compat_arpt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p, limit = %p\n", e, limit);
return -EINVAL;
}
@@ -1246,7 +1248,7 @@ check_compat_entry_size_and_hooks(struct compat_arpt_entry *e,
}
/* For purposes of check_entry casting the compat entry is fine */
- ret = check_entry((struct arpt_entry *)e, name);
+ ret = check_entry((struct arpt_entry *)e);
if (ret)
return ret;
@@ -1662,6 +1664,7 @@ static int compat_get_entries(struct net *net,
*len, sizeof(get) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(NFPROTO_ARP);
t = xt_find_table_lock(net, NFPROTO_ARP, get.name);
@@ -1780,9 +1783,29 @@ static int do_arpt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len
return ret;
}
-struct xt_table *arpt_register_table(struct net *net,
- const struct xt_table *table,
- const struct arpt_replace *repl)
+static void __arpt_unregister_table(struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct arpt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int arpt_register_table(struct net *net,
+ const struct xt_table *table,
+ const struct arpt_replace *repl,
+ const struct nf_hook_ops *ops,
+ struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
@@ -1791,10 +1814,8 @@ struct xt_table *arpt_register_table(struct net *net,
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!newinfo)
+ return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -1809,30 +1830,28 @@ struct xt_table *arpt_register_table(struct net *net,
ret = PTR_ERR(new_table);
goto out_free;
}
- return new_table;
+
+ /* set res now, will see skbs right after nf_register_net_hooks */
+ WRITE_ONCE(*res, new_table);
+
+ ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ret != 0) {
+ __arpt_unregister_table(new_table);
+ *res = NULL;
+ }
+
+ return ret;
out_free:
xt_free_table_info(newinfo);
-out:
- return ERR_PTR(ret);
+ return ret;
}
-void arpt_unregister_table(struct xt_table *table)
+void arpt_unregister_table(struct net *net, struct xt_table *table,
+ const struct nf_hook_ops *ops)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
- struct module *table_owner = table->me;
- struct arpt_entry *iter;
-
- private = xt_unregister_table(table);
-
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries;
- xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter);
- if (private->number > private->initial_entries)
- module_put(table_owner);
- xt_free_table_info(private);
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ __arpt_unregister_table(table);
}
/* The built-in targets: standard (NULL) and error. */
@@ -1905,7 +1924,7 @@ static int __init arp_tables_init(void)
if (ret < 0)
goto err4;
- printk(KERN_INFO "arp_tables: (C) 2002 David S. Miller\n");
+ pr_info("arp_tables: (C) 2002 David S. Miller\n");
return 0;
err4:
diff --git a/net/ipv4/netfilter/arptable_filter.c b/net/ipv4/netfilter/arptable_filter.c
index 1897ee160920..dd8c80dc32a2 100644
--- a/net/ipv4/netfilter/arptable_filter.c
+++ b/net/ipv4/netfilter/arptable_filter.c
@@ -17,12 +17,15 @@ MODULE_DESCRIPTION("arptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_ARP_IN) | (1 << NF_ARP_OUT) | \
(1 << NF_ARP_FORWARD))
+static int __net_init arptable_filter_table_init(struct net *net);
+
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_ARP,
.priority = NF_IP_PRI_FILTER,
+ .table_init = arptable_filter_table_init,
};
/* The work comes in here from netfilter.c */
@@ -35,26 +38,32 @@ arptable_filter_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *arpfilter_ops __read_mostly;
-static int __net_init arptable_filter_net_init(struct net *net)
+static int __net_init arptable_filter_table_init(struct net *net)
{
struct arpt_replace *repl;
-
+ int err;
+
+ if (net->ipv4.arptable_filter)
+ return 0;
+
repl = arpt_alloc_initial_table(&packet_filter);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.arptable_filter =
- arpt_register_table(net, &packet_filter, repl);
+ err = arpt_register_table(net, &packet_filter, repl, arpfilter_ops,
+ &net->ipv4.arptable_filter);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.arptable_filter);
+ return err;
}
static void __net_exit arptable_filter_net_exit(struct net *net)
{
- arpt_unregister_table(net->ipv4.arptable_filter);
+ if (!net->ipv4.arptable_filter)
+ return;
+ arpt_unregister_table(net, net->ipv4.arptable_filter, arpfilter_ops);
+ net->ipv4.arptable_filter = NULL;
}
static struct pernet_operations arptable_filter_net_ops = {
- .init = arptable_filter_net_init,
.exit = arptable_filter_net_exit,
};
@@ -62,26 +71,23 @@ static int __init arptable_filter_init(void)
{
int ret;
+ arpfilter_ops = xt_hook_ops_alloc(&packet_filter, arptable_filter_hook);
+ if (IS_ERR(arpfilter_ops))
+ return PTR_ERR(arpfilter_ops);
+
ret = register_pernet_subsys(&arptable_filter_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(arpfilter_ops);
return ret;
-
- arpfilter_ops = xt_hook_link(&packet_filter, arptable_filter_hook);
- if (IS_ERR(arpfilter_ops)) {
- ret = PTR_ERR(arpfilter_ops);
- goto cleanup_table;
}
- return ret;
-cleanup_table:
- unregister_pernet_subsys(&arptable_filter_net_ops);
return ret;
}
static void __exit arptable_filter_fini(void)
{
- xt_hook_unlink(&packet_filter, arpfilter_ops);
unregister_pernet_subsys(&arptable_filter_net_ops);
+ kfree(arpfilter_ops);
}
module_init(arptable_filter_init);
diff --git a/net/ipv4/netfilter/ip_tables.c b/net/ipv4/netfilter/ip_tables.c
index b99affad6ba1..631c100a1338 100644
--- a/net/ipv4/netfilter/ip_tables.c
+++ b/net/ipv4/netfilter/ip_tables.c
@@ -168,11 +168,12 @@ get_entry(const void *base, unsigned int offset)
/* All zeroes == unconditional rule. */
/* Mildly perf critical (only if packet tracing is on) */
-static inline bool unconditional(const struct ipt_ip *ip)
+static inline bool unconditional(const struct ipt_entry *e)
{
static const struct ipt_ip uncond;
- return memcmp(ip, &uncond, sizeof(uncond)) == 0;
+ return e->target_offset == sizeof(struct ipt_entry) &&
+ memcmp(&e->ip, &uncond, sizeof(uncond)) == 0;
#undef FWINV
}
@@ -229,11 +230,10 @@ get_chainname_rulenum(const struct ipt_entry *s, const struct ipt_entry *e,
} else if (s == e) {
(*rulenum)++;
- if (s->target_offset == sizeof(struct ipt_entry) &&
+ if (unconditional(s) &&
strcmp(t->target.u.kernel.target->name,
XT_STANDARD_TARGET) == 0 &&
- t->verdict < 0 &&
- unconditional(&s->ip)) {
+ t->verdict < 0) {
/* Tail of chains: STANDARD target (return/policy) */
*comment = *chainname == hookname
? comments[NF_IP_TRACE_COMMENT_POLICY]
@@ -476,11 +476,10 @@ mark_source_chains(const struct xt_table_info *newinfo,
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
- if ((e->target_offset == sizeof(struct ipt_entry) &&
+ if ((unconditional(e) &&
(strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < 0 && unconditional(&e->ip)) ||
- visited) {
+ t->verdict < 0) || visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
@@ -569,14 +568,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
}
static int
-check_entry(const struct ipt_entry *e, const char *name)
+check_entry(const struct ipt_entry *e)
{
const struct xt_entry_target *t;
- if (!ip_checkentry(&e->ip)) {
- duprintf("ip check failed %p %s.\n", e, name);
+ if (!ip_checkentry(&e->ip))
return -EINVAL;
- }
if (e->target_offset + sizeof(struct xt_entry_target) >
e->next_offset)
@@ -666,10 +663,6 @@ find_check_entry(struct ipt_entry *e, struct net *net, const char *name,
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- ret = check_entry(e, name);
- if (ret)
- return ret;
-
e->counters.pcnt = xt_percpu_counter_alloc();
if (IS_ERR_VALUE(e->counters.pcnt))
return -ENOMEM;
@@ -721,7 +714,7 @@ static bool check_underflow(const struct ipt_entry *e)
const struct xt_entry_target *t;
unsigned int verdict;
- if (!unconditional(&e->ip))
+ if (!unconditional(e))
return false;
t = ipt_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
@@ -741,9 +734,11 @@ check_entry_size_and_hooks(struct ipt_entry *e,
unsigned int valid_hooks)
{
unsigned int h;
+ int err;
if ((unsigned long)e % __alignof__(struct ipt_entry) != 0 ||
- (unsigned char *)e + sizeof(struct ipt_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct ipt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p\n", e);
return -EINVAL;
}
@@ -755,6 +750,10 @@ check_entry_size_and_hooks(struct ipt_entry *e,
return -EINVAL;
}
+ err = check_entry(e);
+ if (err)
+ return err;
+
/* Check hooks & underflows */
for (h = 0; h < NF_INET_NUMHOOKS; h++) {
if (!(valid_hooks & (1 << h)))
@@ -763,9 +762,9 @@ check_entry_size_and_hooks(struct ipt_entry *e,
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
if (!check_underflow(e)) {
- pr_err("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ pr_debug("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
return -EINVAL;
}
newinfo->underflow[h] = underflows[h];
@@ -1157,6 +1156,7 @@ get_entries(struct net *net, struct ipt_get_entries __user *uptr,
*len, sizeof(get) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET, get.name);
if (!IS_ERR_OR_NULL(t)) {
@@ -1493,7 +1493,8 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_ipt_entry) != 0 ||
- (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct compat_ipt_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p, limit = %p\n", e, limit);
return -EINVAL;
}
@@ -1506,7 +1507,7 @@ check_compat_entry_size_and_hooks(struct compat_ipt_entry *e,
}
/* For purposes of check_entry casting the compat entry is fine */
- ret = check_entry((struct ipt_entry *)e, name);
+ ret = check_entry((struct ipt_entry *)e);
if (ret)
return ret;
@@ -1935,6 +1936,7 @@ compat_get_entries(struct net *net, struct compat_ipt_get_entries __user *uptr,
*len, sizeof(get) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(AF_INET);
t = xt_find_table_lock(net, AF_INET, get.name);
@@ -2062,9 +2064,27 @@ do_ipt_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
return ret;
}
-struct xt_table *ipt_register_table(struct net *net,
- const struct xt_table *table,
- const struct ipt_replace *repl)
+static void __ipt_unregister_table(struct net *net, struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ipt_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int ipt_register_table(struct net *net, const struct xt_table *table,
+ const struct ipt_replace *repl,
+ const struct nf_hook_ops *ops, struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
@@ -2073,10 +2093,8 @@ struct xt_table *ipt_register_table(struct net *net,
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!newinfo)
+ return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2091,30 +2109,27 @@ struct xt_table *ipt_register_table(struct net *net,
goto out_free;
}
- return new_table;
+ /* set res now, will see skbs right after nf_register_net_hooks */
+ WRITE_ONCE(*res, new_table);
+
+ ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ret != 0) {
+ __ipt_unregister_table(net, new_table);
+ *res = NULL;
+ }
+
+ return ret;
out_free:
xt_free_table_info(newinfo);
-out:
- return ERR_PTR(ret);
+ return ret;
}
-void ipt_unregister_table(struct net *net, struct xt_table *table)
+void ipt_unregister_table(struct net *net, struct xt_table *table,
+ const struct nf_hook_ops *ops)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
- struct module *table_owner = table->me;
- struct ipt_entry *iter;
-
- private = xt_unregister_table(table);
-
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries;
- xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter, net);
- if (private->number > private->initial_entries)
- module_put(table_owner);
- xt_free_table_info(private);
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ __ipt_unregister_table(net, table);
}
/* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c
index 5fdc556514ba..db5b87509446 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -18,7 +18,8 @@
#include <net/netfilter/nf_conntrack_synproxy.h>
static struct iphdr *
-synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
+synproxy_build_ip(struct net *net, struct sk_buff *skb, __be32 saddr,
+ __be32 daddr)
{
struct iphdr *iph;
@@ -29,7 +30,7 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
iph->tos = 0;
iph->id = 0;
iph->frag_off = htons(IP_DF);
- iph->ttl = sysctl_ip_default_ttl;
+ iph->ttl = net->ipv4.sysctl_ip_default_ttl;
iph->protocol = IPPROTO_TCP;
iph->check = 0;
iph->saddr = saddr;
@@ -39,14 +40,12 @@ synproxy_build_ip(struct sk_buff *skb, __be32 saddr, __be32 daddr)
}
static void
-synproxy_send_tcp(const struct synproxy_net *snet,
+synproxy_send_tcp(struct net *net,
const struct sk_buff *skb, struct sk_buff *nskb,
struct nf_conntrack *nfct, enum ip_conntrack_info ctinfo,
struct iphdr *niph, struct tcphdr *nth,
unsigned int tcp_hdr_size)
{
- struct net *net = nf_ct_net(snet->tmpl);
-
nth->check = ~tcp_v4_check(tcp_hdr_size, niph->saddr, niph->daddr, 0);
nskb->ip_summed = CHECKSUM_PARTIAL;
nskb->csum_start = (unsigned char *)nth - nskb->head;
@@ -71,7 +70,7 @@ free_nskb:
}
static void
-synproxy_send_client_synack(const struct synproxy_net *snet,
+synproxy_send_client_synack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
@@ -90,7 +89,7 @@ synproxy_send_client_synack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -108,15 +107,16 @@ synproxy_send_client_synack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_server_syn(const struct synproxy_net *snet,
+synproxy_send_server_syn(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts, u32 recv_seq)
{
+ struct synproxy_net *snet = synproxy_pernet(net);
struct sk_buff *nskb;
struct iphdr *iph, *niph;
struct tcphdr *nth;
@@ -131,7 +131,7 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -152,12 +152,12 @@ synproxy_send_server_syn(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
+ synproxy_send_tcp(net, skb, nskb, &snet->tmpl->ct_general, IP_CT_NEW,
niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_server_ack(const struct synproxy_net *snet,
+synproxy_send_server_ack(struct net *net,
const struct ip_ct_tcp *state,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
@@ -176,7 +176,7 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, iph->daddr, iph->saddr);
+ niph = synproxy_build_ip(net, nskb, iph->daddr, iph->saddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -192,11 +192,11 @@ synproxy_send_server_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+ synproxy_send_tcp(net, skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
}
static void
-synproxy_send_client_ack(const struct synproxy_net *snet,
+synproxy_send_client_ack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
const struct synproxy_options *opts)
{
@@ -214,7 +214,7 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
return;
skb_reserve(nskb, MAX_TCP_HEADER);
- niph = synproxy_build_ip(nskb, iph->saddr, iph->daddr);
+ niph = synproxy_build_ip(net, nskb, iph->saddr, iph->daddr);
skb_reset_transport_header(nskb);
nth = (struct tcphdr *)skb_put(nskb, tcp_hdr_size);
@@ -230,15 +230,16 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
synproxy_build_options(nth, opts);
- synproxy_send_tcp(snet, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
+ synproxy_send_tcp(net, skb, nskb, skb->nfct, IP_CT_ESTABLISHED_REPLY,
niph, nth, tcp_hdr_size);
}
static bool
-synproxy_recv_client_ack(const struct synproxy_net *snet,
+synproxy_recv_client_ack(struct net *net,
const struct sk_buff *skb, const struct tcphdr *th,
struct synproxy_options *opts, u32 recv_seq)
{
+ struct synproxy_net *snet = synproxy_pernet(net);
int mss;
mss = __cookie_v4_check(ip_hdr(skb), th, ntohl(th->ack_seq) - 1);
@@ -254,7 +255,7 @@ synproxy_recv_client_ack(const struct synproxy_net *snet,
if (opts->options & XT_SYNPROXY_OPT_TIMESTAMP)
synproxy_check_timestamp_cookie(opts);
- synproxy_send_server_syn(snet, skb, th, opts, recv_seq);
+ synproxy_send_server_syn(net, skb, th, opts, recv_seq);
return true;
}
@@ -262,7 +263,8 @@ static unsigned int
synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_synproxy_info *info = par->targinfo;
- struct synproxy_net *snet = synproxy_pernet(par->net);
+ struct net *net = par->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
struct synproxy_options opts = {};
struct tcphdr *th, _th;
@@ -291,12 +293,12 @@ synproxy_tg4(struct sk_buff *skb, const struct xt_action_param *par)
XT_SYNPROXY_OPT_SACK_PERM |
XT_SYNPROXY_OPT_ECN);
- synproxy_send_client_synack(snet, skb, th, &opts);
+ synproxy_send_client_synack(net, skb, th, &opts);
return NF_DROP;
} else if (th->ack && !(th->fin || th->rst || th->syn)) {
/* ACK from client */
- synproxy_recv_client_ack(snet, skb, th, &opts, ntohl(th->seq));
+ synproxy_recv_client_ack(net, skb, th, &opts, ntohl(th->seq));
return NF_DROP;
}
@@ -307,7 +309,8 @@ static unsigned int ipv4_synproxy_hook(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *nhs)
{
- struct synproxy_net *snet = synproxy_pernet(nhs->net);
+ struct net *net = nhs->net;
+ struct synproxy_net *snet = synproxy_pernet(net);
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
struct nf_conn_synproxy *synproxy;
@@ -364,7 +367,7 @@ static unsigned int ipv4_synproxy_hook(void *priv,
* therefore we need to add 1 to make the SYN sequence
* number match the one of first SYN.
*/
- if (synproxy_recv_client_ack(snet, skb, th, &opts,
+ if (synproxy_recv_client_ack(net, skb, th, &opts,
ntohl(th->seq) + 1))
this_cpu_inc(snet->stats->cookie_retrans);
@@ -390,12 +393,12 @@ static unsigned int ipv4_synproxy_hook(void *priv,
XT_SYNPROXY_OPT_SACK_PERM);
swap(opts.tsval, opts.tsecr);
- synproxy_send_server_ack(snet, state, skb, th, &opts);
+ synproxy_send_server_ack(net, state, skb, th, &opts);
nf_ct_seqadj_init(ct, ctinfo, synproxy->isn - ntohl(th->seq));
swap(opts.tsval, opts.tsecr);
- synproxy_send_client_ack(snet, skb, th, &opts);
+ synproxy_send_client_ack(net, skb, th, &opts);
consume_skb(skb);
return NF_STOLEN;
diff --git a/net/ipv4/netfilter/iptable_filter.c b/net/ipv4/netfilter/iptable_filter.c
index 397ef2dd133e..7667f223d7f8 100644
--- a/net/ipv4/netfilter/iptable_filter.c
+++ b/net/ipv4/netfilter/iptable_filter.c
@@ -23,6 +23,7 @@ MODULE_DESCRIPTION("iptables filter table");
#define FILTER_VALID_HOOKS ((1 << NF_INET_LOCAL_IN) | \
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
+static int __net_init iptable_filter_table_init(struct net *net);
static const struct xt_table packet_filter = {
.name = "filter",
@@ -30,6 +31,7 @@ static const struct xt_table packet_filter = {
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_FILTER,
+ .table_init = iptable_filter_table_init,
};
static unsigned int
@@ -48,12 +50,16 @@ iptable_filter_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *filter_ops __read_mostly;
/* Default to forward because I got too much mail already. */
-static bool forward = true;
+static bool forward __read_mostly = true;
module_param(forward, bool, 0000);
-static int __net_init iptable_filter_net_init(struct net *net)
+static int __net_init iptable_filter_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int err;
+
+ if (net->ipv4.iptable_filter)
+ return 0;
repl = ipt_alloc_initial_table(&packet_filter);
if (repl == NULL)
@@ -62,15 +68,26 @@ static int __net_init iptable_filter_net_init(struct net *net)
((struct ipt_standard *)repl->entries)[1].target.verdict =
forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
- net->ipv4.iptable_filter =
- ipt_register_table(net, &packet_filter, repl);
+ err = ipt_register_table(net, &packet_filter, repl, filter_ops,
+ &net->ipv4.iptable_filter);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_filter);
+ return err;
+}
+
+static int __net_init iptable_filter_net_init(struct net *net)
+{
+ if (net == &init_net || !forward)
+ return iptable_filter_table_init(net);
+
+ return 0;
}
static void __net_exit iptable_filter_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_filter);
+ if (!net->ipv4.iptable_filter)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_filter, filter_ops);
+ net->ipv4.iptable_filter = NULL;
}
static struct pernet_operations iptable_filter_net_ops = {
@@ -82,24 +99,21 @@ static int __init iptable_filter_init(void)
{
int ret;
+ filter_ops = xt_hook_ops_alloc(&packet_filter, iptable_filter_hook);
+ if (IS_ERR(filter_ops))
+ return PTR_ERR(filter_ops);
+
ret = register_pernet_subsys(&iptable_filter_net_ops);
if (ret < 0)
- return ret;
-
- /* Register hooks */
- filter_ops = xt_hook_link(&packet_filter, iptable_filter_hook);
- if (IS_ERR(filter_ops)) {
- ret = PTR_ERR(filter_ops);
- unregister_pernet_subsys(&iptable_filter_net_ops);
- }
+ kfree(filter_ops);
return ret;
}
static void __exit iptable_filter_fini(void)
{
- xt_hook_unlink(&packet_filter, filter_ops);
unregister_pernet_subsys(&iptable_filter_net_ops);
+ kfree(filter_ops);
}
module_init(iptable_filter_init);
diff --git a/net/ipv4/netfilter/iptable_mangle.c b/net/ipv4/netfilter/iptable_mangle.c
index ba5d392a13c4..57fc97cdac70 100644
--- a/net/ipv4/netfilter/iptable_mangle.c
+++ b/net/ipv4/netfilter/iptable_mangle.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
+static int __net_init iptable_mangle_table_init(struct net *net);
+
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_MANGLE,
+ .table_init = iptable_mangle_table_init,
};
static unsigned int
@@ -92,27 +95,32 @@ iptable_mangle_hook(void *priv,
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-
-static int __net_init iptable_mangle_net_init(struct net *net)
+static int __net_init iptable_mangle_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_mangle)
+ return 0;
repl = ipt_alloc_initial_table(&packet_mangler);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_mangle =
- ipt_register_table(net, &packet_mangler, repl);
+ ret = ipt_register_table(net, &packet_mangler, repl, mangle_ops,
+ &net->ipv4.iptable_mangle);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_mangle);
+ return ret;
}
static void __net_exit iptable_mangle_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_mangle);
+ if (!net->ipv4.iptable_mangle)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_mangle, mangle_ops);
+ net->ipv4.iptable_mangle = NULL;
}
static struct pernet_operations iptable_mangle_net_ops = {
- .init = iptable_mangle_net_init,
.exit = iptable_mangle_net_exit,
};
@@ -120,15 +128,22 @@ static int __init iptable_mangle_init(void)
{
int ret;
+ mangle_ops = xt_hook_ops_alloc(&packet_mangler, iptable_mangle_hook);
+ if (IS_ERR(mangle_ops)) {
+ ret = PTR_ERR(mangle_ops);
+ return ret;
+ }
+
ret = register_pernet_subsys(&iptable_mangle_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(mangle_ops);
return ret;
+ }
- /* Register hooks */
- mangle_ops = xt_hook_link(&packet_mangler, iptable_mangle_hook);
- if (IS_ERR(mangle_ops)) {
- ret = PTR_ERR(mangle_ops);
+ ret = iptable_mangle_table_init(&init_net);
+ if (ret) {
unregister_pernet_subsys(&iptable_mangle_net_ops);
+ kfree(mangle_ops);
}
return ret;
@@ -136,8 +151,8 @@ static int __init iptable_mangle_init(void)
static void __exit iptable_mangle_fini(void)
{
- xt_hook_unlink(&packet_mangler, mangle_ops);
unregister_pernet_subsys(&iptable_mangle_net_ops);
+ kfree(mangle_ops);
}
module_init(iptable_mangle_init);
diff --git a/net/ipv4/netfilter/iptable_nat.c b/net/ipv4/netfilter/iptable_nat.c
index ae2cd2752046..138a24bc76ad 100644
--- a/net/ipv4/netfilter/iptable_nat.c
+++ b/net/ipv4/netfilter/iptable_nat.c
@@ -18,6 +18,8 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l3proto.h>
+static int __net_init iptable_nat_table_init(struct net *net);
+
static const struct xt_table nf_nat_ipv4_table = {
.name = "nat",
.valid_hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -26,6 +28,7 @@ static const struct xt_table nf_nat_ipv4_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
+ .table_init = iptable_nat_table_init,
};
static unsigned int iptable_nat_do_chain(void *priv,
@@ -95,50 +98,50 @@ static struct nf_hook_ops nf_nat_ipv4_ops[] __read_mostly = {
},
};
-static int __net_init iptable_nat_net_init(struct net *net)
+static int __net_init iptable_nat_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.nat_table)
+ return 0;
repl = ipt_alloc_initial_table(&nf_nat_ipv4_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.nat_table = ipt_register_table(net, &nf_nat_ipv4_table, repl);
+ ret = ipt_register_table(net, &nf_nat_ipv4_table, repl,
+ nf_nat_ipv4_ops, &net->ipv4.nat_table);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.nat_table);
+ return ret;
}
static void __net_exit iptable_nat_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.nat_table);
+ if (!net->ipv4.nat_table)
+ return;
+ ipt_unregister_table(net, net->ipv4.nat_table, nf_nat_ipv4_ops);
+ net->ipv4.nat_table = NULL;
}
static struct pernet_operations iptable_nat_net_ops = {
- .init = iptable_nat_net_init,
.exit = iptable_nat_net_exit,
};
static int __init iptable_nat_init(void)
{
- int err;
+ int ret = register_pernet_subsys(&iptable_nat_net_ops);
- err = register_pernet_subsys(&iptable_nat_net_ops);
- if (err < 0)
- goto err1;
+ if (ret)
+ return ret;
- err = nf_register_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- unregister_pernet_subsys(&iptable_nat_net_ops);
-err1:
- return err;
+ ret = iptable_nat_table_init(&init_net);
+ if (ret)
+ unregister_pernet_subsys(&iptable_nat_net_ops);
+ return ret;
}
static void __exit iptable_nat_exit(void)
{
- nf_unregister_hooks(nf_nat_ipv4_ops, ARRAY_SIZE(nf_nat_ipv4_ops));
unregister_pernet_subsys(&iptable_nat_net_ops);
}
diff --git a/net/ipv4/netfilter/iptable_raw.c b/net/ipv4/netfilter/iptable_raw.c
index 1ba02811acb0..2642ecd2645c 100644
--- a/net/ipv4/netfilter/iptable_raw.c
+++ b/net/ipv4/netfilter/iptable_raw.c
@@ -10,12 +10,15 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+static int __net_init iptable_raw_table_init(struct net *net);
+
static const struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_RAW,
+ .table_init = iptable_raw_table_init,
};
/* The work comes in here from netfilter.c. */
@@ -34,26 +37,32 @@ iptable_raw_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init iptable_raw_net_init(struct net *net)
+static int __net_init iptable_raw_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_raw)
+ return 0;
repl = ipt_alloc_initial_table(&packet_raw);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_raw =
- ipt_register_table(net, &packet_raw, repl);
+ ret = ipt_register_table(net, &packet_raw, repl, rawtable_ops,
+ &net->ipv4.iptable_raw);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_raw);
+ return ret;
}
static void __net_exit iptable_raw_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_raw);
+ if (!net->ipv4.iptable_raw)
+ return;
+ ipt_unregister_table(net, net->ipv4.iptable_raw, rawtable_ops);
+ net->ipv4.iptable_raw = NULL;
}
static struct pernet_operations iptable_raw_net_ops = {
- .init = iptable_raw_net_init,
.exit = iptable_raw_net_exit,
};
@@ -61,15 +70,20 @@ static int __init iptable_raw_init(void)
{
int ret;
+ rawtable_ops = xt_hook_ops_alloc(&packet_raw, iptable_raw_hook);
+ if (IS_ERR(rawtable_ops))
+ return PTR_ERR(rawtable_ops);
+
ret = register_pernet_subsys(&iptable_raw_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(rawtable_ops);
return ret;
+ }
- /* Register hooks */
- rawtable_ops = xt_hook_link(&packet_raw, iptable_raw_hook);
- if (IS_ERR(rawtable_ops)) {
- ret = PTR_ERR(rawtable_ops);
+ ret = iptable_raw_table_init(&init_net);
+ if (ret) {
unregister_pernet_subsys(&iptable_raw_net_ops);
+ kfree(rawtable_ops);
}
return ret;
@@ -77,8 +91,8 @@ static int __init iptable_raw_init(void)
static void __exit iptable_raw_fini(void)
{
- xt_hook_unlink(&packet_raw, rawtable_ops);
unregister_pernet_subsys(&iptable_raw_net_ops);
+ kfree(rawtable_ops);
}
module_init(iptable_raw_init);
diff --git a/net/ipv4/netfilter/iptable_security.c b/net/ipv4/netfilter/iptable_security.c
index c2e23d5e9cd4..ff226596e4b5 100644
--- a/net/ipv4/netfilter/iptable_security.c
+++ b/net/ipv4/netfilter/iptable_security.c
@@ -28,12 +28,15 @@ MODULE_DESCRIPTION("iptables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
+static int __net_init iptable_security_table_init(struct net *net);
+
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV4,
.priority = NF_IP_PRI_SECURITY,
+ .table_init = iptable_security_table_init,
};
static unsigned int
@@ -51,26 +54,33 @@ iptable_security_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init iptable_security_net_init(struct net *net)
+static int __net_init iptable_security_table_init(struct net *net)
{
struct ipt_replace *repl;
+ int ret;
+
+ if (net->ipv4.iptable_security)
+ return 0;
repl = ipt_alloc_initial_table(&security_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv4.iptable_security =
- ipt_register_table(net, &security_table, repl);
+ ret = ipt_register_table(net, &security_table, repl, sectbl_ops,
+ &net->ipv4.iptable_security);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv4.iptable_security);
+ return ret;
}
static void __net_exit iptable_security_net_exit(struct net *net)
{
- ipt_unregister_table(net, net->ipv4.iptable_security);
+ if (!net->ipv4.iptable_security)
+ return;
+
+ ipt_unregister_table(net, net->ipv4.iptable_security, sectbl_ops);
+ net->ipv4.iptable_security = NULL;
}
static struct pernet_operations iptable_security_net_ops = {
- .init = iptable_security_net_init,
.exit = iptable_security_net_exit,
};
@@ -78,27 +88,29 @@ static int __init iptable_security_init(void)
{
int ret;
+ sectbl_ops = xt_hook_ops_alloc(&security_table, iptable_security_hook);
+ if (IS_ERR(sectbl_ops))
+ return PTR_ERR(sectbl_ops);
+
ret = register_pernet_subsys(&iptable_security_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(sectbl_ops);
return ret;
-
- sectbl_ops = xt_hook_link(&security_table, iptable_security_hook);
- if (IS_ERR(sectbl_ops)) {
- ret = PTR_ERR(sectbl_ops);
- goto cleanup_table;
}
- return ret;
+ ret = iptable_security_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&iptable_security_net_ops);
+ kfree(sectbl_ops);
+ }
-cleanup_table:
- unregister_pernet_subsys(&iptable_security_net_ops);
return ret;
}
static void __exit iptable_security_fini(void)
{
- xt_hook_unlink(&security_table, sectbl_ops);
unregister_pernet_subsys(&iptable_security_net_ops);
+ kfree(sectbl_ops);
}
module_init(iptable_security_init);
diff --git a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
index 461ca926fd39..e3c46e8e2762 100644
--- a/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_conntrack_l3proto_ipv4.c
@@ -451,7 +451,7 @@ static int __init nf_conntrack_l3proto_ipv4_init(void)
ret = nf_register_sockopt(&so_getorigdst);
if (ret < 0) {
- printk(KERN_ERR "Unable to register netfilter socket option\n");
+ pr_err("Unable to register netfilter socket option\n");
return ret;
}
diff --git a/net/ipv4/netfilter/nf_defrag_ipv4.c b/net/ipv4/netfilter/nf_defrag_ipv4.c
index 0e5591c2ee9f..d88da36b383c 100644
--- a/net/ipv4/netfilter/nf_defrag_ipv4.c
+++ b/net/ipv4/netfilter/nf_defrag_ipv4.c
@@ -27,16 +27,12 @@ static int nf_ct_ipv4_gather_frags(struct net *net, struct sk_buff *skb,
{
int err;
- skb_orphan(skb);
-
local_bh_disable();
err = ip_defrag(net, skb, user);
local_bh_enable();
- if (!err) {
- ip_send_check(ip_hdr(skb));
+ if (!err)
skb->ignore_df = 1;
- }
return err;
}
@@ -67,10 +63,9 @@ static unsigned int ipv4_conntrack_defrag(void *priv,
const struct nf_hook_state *state)
{
struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (sk && (sk->sk_family == PF_INET) &&
- inet->nodefrag)
+ if (sk && sk_fullsock(sk) && (sk->sk_family == PF_INET) &&
+ inet_sk(sk)->nodefrag)
return NF_ACCEPT;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
diff --git a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
index 5075b7ecd26d..f8aad03d674b 100644
--- a/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_l3proto_ipv4.c
@@ -127,28 +127,15 @@ static void nf_nat_ipv4_csum_recalc(struct sk_buff *skb,
u8 proto, void *data, __sum16 *check,
int datalen, int oldlen)
{
- const struct iphdr *iph = ip_hdr(skb);
- struct rtable *rt = skb_rtable(skb);
-
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt_flags & RTCF_LOCAL) &&
- (!skb->dev || skb->dev->features & NETIF_F_V4_CSUM)) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- ip_hdrlen(skb);
- skb->csum_offset = (void *)check - data;
- *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, proto, 0);
- } else {
- *check = 0;
- *check = csum_tcpudp_magic(iph->saddr, iph->daddr,
- datalen, proto,
- csum_partial(data, datalen,
- 0));
- if (proto == IPPROTO_UDP && !*check)
- *check = CSUM_MANGLED_0;
- }
+ const struct iphdr *iph = ip_hdr(skb);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+ ip_hdrlen(skb);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_tcpudp_magic(iph->saddr, iph->daddr, datalen,
+ proto, 0);
} else
inet_proto_csum_replace2(check, skb,
htons(oldlen), htons(datalen), true);
diff --git a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
index c6eb42100e9a..ea91058b5f6f 100644
--- a/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
+++ b/net/ipv4/netfilter/nf_nat_masquerade_ipv4.c
@@ -108,10 +108,18 @@ static int masq_inet_event(struct notifier_block *this,
unsigned long event,
void *ptr)
{
- struct net_device *dev = ((struct in_ifaddr *)ptr)->ifa_dev->dev;
+ struct in_device *idev = ((struct in_ifaddr *)ptr)->ifa_dev;
struct netdev_notifier_info info;
- netdev_notifier_info_init(&info, dev);
+ /* The masq_dev_notifier will catch the case of the device going
+ * down. So if the inetdev is dead and being destroyed we have
+ * no work to do. Otherwise this is an individual address removal
+ * and we have to perform the flush.
+ */
+ if (idev->dead)
+ return NOTIFY_DONE;
+
+ netdev_notifier_info_init(&info, idev->dev);
return masq_device_event(this, event, &info);
}
diff --git a/net/ipv4/netfilter/nf_nat_pptp.c b/net/ipv4/netfilter/nf_nat_pptp.c
index 657d2307f031..b3ca21b2ba9b 100644
--- a/net/ipv4/netfilter/nf_nat_pptp.c
+++ b/net/ipv4/netfilter/nf_nat_pptp.c
@@ -45,7 +45,7 @@ static void pptp_nat_expected(struct nf_conn *ct,
struct net *net = nf_ct_net(ct);
const struct nf_conn *master = ct->master;
struct nf_conntrack_expect *other_exp;
- struct nf_conntrack_tuple t;
+ struct nf_conntrack_tuple t = {};
const struct nf_ct_pptp_master *ct_pptp_info;
const struct nf_nat_pptp *nat_pptp_info;
struct nf_nat_range range;
diff --git a/net/ipv4/netfilter/nf_nat_snmp_basic.c b/net/ipv4/netfilter/nf_nat_snmp_basic.c
index ddb894ac1458..c9b52c361da2 100644
--- a/net/ipv4/netfilter/nf_nat_snmp_basic.c
+++ b/net/ipv4/netfilter/nf_nat_snmp_basic.c
@@ -1048,7 +1048,7 @@ static int snmp_parse_mangle(unsigned char *msg,
if (!asn1_uint_decode (&ctx, end, &vers))
return 0;
if (debug > 1)
- printk(KERN_DEBUG "bsalg: snmp version: %u\n", vers + 1);
+ pr_debug("bsalg: snmp version: %u\n", vers + 1);
if (vers > 1)
return 1;
@@ -1064,10 +1064,10 @@ static int snmp_parse_mangle(unsigned char *msg,
if (debug > 1) {
unsigned int i;
- printk(KERN_DEBUG "bsalg: community: ");
+ pr_debug("bsalg: community: ");
for (i = 0; i < comm.len; i++)
- printk("%c", comm.data[i]);
- printk("\n");
+ pr_cont("%c", comm.data[i]);
+ pr_cont("\n");
}
kfree(comm.data);
@@ -1091,9 +1091,9 @@ static int snmp_parse_mangle(unsigned char *msg,
};
if (pdutype > SNMP_PDU_TRAP2)
- printk(KERN_DEBUG "bsalg: bad pdu type %u\n", pdutype);
+ pr_debug("bsalg: bad pdu type %u\n", pdutype);
else
- printk(KERN_DEBUG "bsalg: pdu: %s\n", pdus[pdutype]);
+ pr_debug("bsalg: pdu: %s\n", pdus[pdutype]);
}
if (pdutype != SNMP_PDU_RESPONSE &&
pdutype != SNMP_PDU_TRAP1 && pdutype != SNMP_PDU_TRAP2)
@@ -1119,7 +1119,7 @@ static int snmp_parse_mangle(unsigned char *msg,
return 0;
if (debug > 1)
- printk(KERN_DEBUG "bsalg: request: id=0x%lx error_status=%u "
+ pr_debug("bsalg: request: id=0x%lx error_status=%u "
"error_index=%u\n", req.id, req.error_status,
req.error_index);
}
@@ -1145,13 +1145,13 @@ static int snmp_parse_mangle(unsigned char *msg,
}
if (debug > 1) {
- printk(KERN_DEBUG "bsalg: object: ");
+ pr_debug("bsalg: object: ");
for (i = 0; i < obj->id_len; i++) {
if (i > 0)
- printk(".");
- printk("%lu", obj->id[i]);
+ pr_cont(".");
+ pr_cont("%lu", obj->id[i]);
}
- printk(": type=%u\n", obj->type);
+ pr_cont(": type=%u\n", obj->type);
}
diff --git a/net/ipv4/netfilter/nf_reject_ipv4.c b/net/ipv4/netfilter/nf_reject_ipv4.c
index c747b2d9eb77..b6ea57ec5e14 100644
--- a/net/ipv4/netfilter/nf_reject_ipv4.c
+++ b/net/ipv4/netfilter/nf_reject_ipv4.c
@@ -14,7 +14,6 @@
#include <net/netfilter/ipv4/nf_reject.h>
#include <linux/netfilter_ipv4.h>
#include <linux/netfilter_bridge.h>
-#include <net/netfilter/ipv4/nf_reject.h>
const struct tcphdr *nf_reject_ip_tcphdr_get(struct sk_buff *oldskb,
struct tcphdr *_oth, int hook)
diff --git a/net/ipv4/netfilter/nf_tables_arp.c b/net/ipv4/netfilter/nf_tables_arp.c
index 9d09d4f59545..cd84d4295a20 100644
--- a/net/ipv4/netfilter/nf_tables_arp.c
+++ b/net/ipv4/netfilter/nf_tables_arp.c
@@ -57,7 +57,7 @@ err:
static void nf_tables_arp_exit_net(struct net *net)
{
- nft_unregister_afinfo(net->nft.arp);
+ nft_unregister_afinfo(net, net->nft.arp);
kfree(net->nft.arp);
}
diff --git a/net/ipv4/netfilter/nf_tables_ipv4.c b/net/ipv4/netfilter/nf_tables_ipv4.c
index ca9dc3c46c4f..e44ba3b12fbb 100644
--- a/net/ipv4/netfilter/nf_tables_ipv4.c
+++ b/net/ipv4/netfilter/nf_tables_ipv4.c
@@ -78,7 +78,7 @@ err:
static void nf_tables_ipv4_exit_net(struct net *net)
{
- nft_unregister_afinfo(net->nft.ipv4);
+ nft_unregister_afinfo(net, net->nft.ipv4);
kfree(net->nft.ipv4);
}
diff --git a/net/ipv4/netfilter/nft_masq_ipv4.c b/net/ipv4/netfilter/nft_masq_ipv4.c
index b72ffc58e255..51ced81b616c 100644
--- a/net/ipv4/netfilter/nft_masq_ipv4.c
+++ b/net/ipv4/netfilter/nft_masq_ipv4.c
@@ -25,7 +25,12 @@ static void nft_masq_ipv4_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
-
+ if (priv->sreg_proto_min) {
+ range.min_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
+ range.max_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
+ }
regs->verdict.code = nf_nat_masquerade_ipv4(pkt->skb, pkt->hook,
&range, pkt->out);
}
diff --git a/net/ipv4/ping.c b/net/ipv4/ping.c
index e89094ab5ddb..cf9700b1a106 100644
--- a/net/ipv4/ping.c
+++ b/net/ipv4/ping.c
@@ -145,10 +145,12 @@ fail:
}
EXPORT_SYMBOL_GPL(ping_get_port);
-void ping_hash(struct sock *sk)
+int ping_hash(struct sock *sk)
{
pr_debug("ping_hash(sk->port=%u)\n", inet_sk(sk)->inet_num);
BUG(); /* "Please do not press this button again." */
+
+ return 0;
}
void ping_unhash(struct sock *sk)
@@ -746,8 +748,10 @@ static int ping_v4_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc, false);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
}
@@ -1063,6 +1067,7 @@ static struct sock *ping_get_idx(struct seq_file *seq, loff_t pos)
}
void *ping_seq_start(struct seq_file *seq, loff_t *pos, sa_family_t family)
+ __acquires(ping_table.lock)
{
struct ping_iter_state *state = seq->private;
state->bucket = 0;
@@ -1094,6 +1099,7 @@ void *ping_seq_next(struct seq_file *seq, void *v, loff_t *pos)
EXPORT_SYMBOL_GPL(ping_seq_next);
void ping_seq_stop(struct seq_file *seq, void *v)
+ __releases(ping_table.lock)
{
read_unlock_bh(&ping_table.lock);
}
@@ -1136,13 +1142,6 @@ static int ping_v4_seq_show(struct seq_file *seq, void *v)
return 0;
}
-static const struct seq_operations ping_v4_seq_ops = {
- .show = ping_v4_seq_show,
- .start = ping_v4_seq_start,
- .next = ping_seq_next,
- .stop = ping_seq_stop,
-};
-
static int ping_seq_open(struct inode *inode, struct file *file)
{
struct ping_seq_afinfo *afinfo = PDE_DATA(inode);
diff --git a/net/ipv4/proc.c b/net/ipv4/proc.c
index 3abd9d7a3adf..9f665b63a927 100644
--- a/net/ipv4/proc.c
+++ b/net/ipv4/proc.c
@@ -390,7 +390,7 @@ static int snmp_seq_show(struct seq_file *seq, void *v)
seq_printf(seq, "\nIp: %d %d",
IPV4_DEVCONF_ALL(net, FORWARDING) ? 1 : 2,
- sysctl_ip_default_ttl);
+ net->ipv4.sysctl_ip_default_ttl);
BUILD_BUG_ON(offsetof(struct ipstats_mib, mibs) != 0);
for (i = 0; snmp4_ipstats_list[i].name != NULL; i++)
diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c
index 8c0d0bdc2a7c..8d22de74080c 100644
--- a/net/ipv4/raw.c
+++ b/net/ipv4/raw.c
@@ -93,7 +93,7 @@ static struct raw_hashinfo raw_v4_hashinfo = {
.lock = __RW_LOCK_UNLOCKED(raw_v4_hashinfo.lock),
};
-void raw_hash_sk(struct sock *sk)
+int raw_hash_sk(struct sock *sk)
{
struct raw_hashinfo *h = sk->sk_prot->h.raw_hash;
struct hlist_head *head;
@@ -104,6 +104,8 @@ void raw_hash_sk(struct sock *sk)
sk_add_node(sk, head);
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
write_unlock_bh(&h->lock);
+
+ return 0;
}
EXPORT_SYMBOL_GPL(raw_hash_sk);
@@ -406,10 +408,12 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4,
ip_select_ident(net, skb, NULL);
iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl);
+ skb->transport_header += iphlen;
+ if (iph->protocol == IPPROTO_ICMP &&
+ length >= iphlen + sizeof(struct icmphdr))
+ icmp_out_count(net, ((struct icmphdr *)
+ skb_transport_header(skb))->type);
}
- if (iph->protocol == IPPROTO_ICMP)
- icmp_out_count(net, ((struct icmphdr *)
- skb_transport_header(skb))->type);
err = NF_HOOK(NFPROTO_IPV4, NF_INET_LOCAL_OUT,
net, sk, skb, NULL, rt->dst.dev,
@@ -545,8 +549,10 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(net, msg, &ipc, false);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
goto out;
+ }
if (ipc.opt)
free = 1;
}
@@ -599,8 +605,11 @@ static int raw_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
(inet->hdrincl ? FLOWI_FLAG_KNOWN_NH : 0),
daddr, saddr, 0, 0);
- if (!saddr && ipc.oif)
- l3mdev_get_saddr(net, ipc.oif, &fl4);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, &fl4);
+ if (err < 0)
+ goto done;
+ }
if (!inet->hdrincl) {
rfv.msg = msg;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 85f184e429c6..02c62299d717 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -129,6 +129,7 @@ static int ip_rt_mtu_expires __read_mostly = 10 * 60 * HZ;
static int ip_rt_min_pmtu __read_mostly = 512 + 20 + 20;
static int ip_rt_min_advmss __read_mostly = 256;
+static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
/*
* Interface to generic destination cache.
*/
@@ -755,7 +756,7 @@ static void __ip_do_redirect(struct rtable *rt, struct sk_buff *skb, struct flow
struct fib_nh *nh = &FIB_RES_NH(res);
update_or_create_fnhe(nh, fl4->daddr, new_gw,
- 0, 0);
+ 0, jiffies + ip_rt_gc_timeout);
}
if (kill_route)
rt->dst.obsolete = DST_OBSOLETE_KILL;
@@ -1556,6 +1557,36 @@ static void ip_handle_martian_source(struct net_device *dev,
#endif
}
+static void ip_del_fnhe(struct fib_nh *nh, __be32 daddr)
+{
+ struct fnhe_hash_bucket *hash;
+ struct fib_nh_exception *fnhe, __rcu **fnhe_p;
+ u32 hval = fnhe_hashfun(daddr);
+
+ spin_lock_bh(&fnhe_lock);
+
+ hash = rcu_dereference_protected(nh->nh_exceptions,
+ lockdep_is_held(&fnhe_lock));
+ hash += hval;
+
+ fnhe_p = &hash->chain;
+ fnhe = rcu_dereference_protected(*fnhe_p, lockdep_is_held(&fnhe_lock));
+ while (fnhe) {
+ if (fnhe->fnhe_daddr == daddr) {
+ rcu_assign_pointer(*fnhe_p, rcu_dereference_protected(
+ fnhe->fnhe_next, lockdep_is_held(&fnhe_lock)));
+ fnhe_flush_routes(fnhe);
+ kfree_rcu(fnhe, rcu);
+ break;
+ }
+ fnhe_p = &fnhe->fnhe_next;
+ fnhe = rcu_dereference_protected(fnhe->fnhe_next,
+ lockdep_is_held(&fnhe_lock));
+ }
+
+ spin_unlock_bh(&fnhe_lock);
+}
+
/* called in rcu_read_lock() section */
static int __mkroute_input(struct sk_buff *skb,
const struct fib_result *res,
@@ -1609,11 +1640,20 @@ static int __mkroute_input(struct sk_buff *skb,
fnhe = find_exception(&FIB_RES_NH(*res), daddr);
if (do_cache) {
- if (fnhe)
+ if (fnhe) {
rth = rcu_dereference(fnhe->fnhe_rth_input);
- else
- rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+ if (rth && rth->dst.expires &&
+ time_after(jiffies, rth->dst.expires)) {
+ ip_del_fnhe(&FIB_RES_NH(*res), daddr);
+ fnhe = NULL;
+ } else {
+ goto rt_cache;
+ }
+ }
+
+ rth = rcu_dereference(FIB_RES_NH(*res).nh_rth_input);
+rt_cache:
if (rt_cache_valid(rth)) {
skb_dst_set_noref(skb, &rth->dst);
goto out;
@@ -2014,19 +2054,29 @@ static struct rtable *__mkroute_output(const struct fib_result *res,
struct fib_nh *nh = &FIB_RES_NH(*res);
fnhe = find_exception(nh, fl4->daddr);
- if (fnhe)
+ if (fnhe) {
prth = &fnhe->fnhe_rth_output;
- else {
- if (unlikely(fl4->flowi4_flags &
- FLOWI_FLAG_KNOWN_NH &&
- !(nh->nh_gw &&
- nh->nh_scope == RT_SCOPE_LINK))) {
- do_cache = false;
- goto add;
+ rth = rcu_dereference(*prth);
+ if (rth && rth->dst.expires &&
+ time_after(jiffies, rth->dst.expires)) {
+ ip_del_fnhe(nh, fl4->daddr);
+ fnhe = NULL;
+ } else {
+ goto rt_cache;
}
- prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
}
+
+ if (unlikely(fl4->flowi4_flags &
+ FLOWI_FLAG_KNOWN_NH &&
+ !(nh->nh_gw &&
+ nh->nh_scope == RT_SCOPE_LINK))) {
+ do_cache = false;
+ goto add;
+ }
+ prth = raw_cpu_ptr(nh->nh_pcpu_rth_output);
rth = rcu_dereference(*prth);
+
+rt_cache:
if (rt_cache_valid(rth)) {
dst_hold(&rth->dst);
return rth;
@@ -2569,7 +2619,6 @@ void ip_rt_multicast_event(struct in_device *in_dev)
}
#ifdef CONFIG_SYSCTL
-static int ip_rt_gc_timeout __read_mostly = RT_GC_TIMEOUT;
static int ip_rt_gc_interval __read_mostly = 60 * HZ;
static int ip_rt_gc_min_interval __read_mostly = HZ / 2;
static int ip_rt_gc_elasticity __read_mostly = 8;
diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index 4cbe9f0a4281..4c04f09338e3 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -19,8 +19,6 @@
#include <net/tcp.h>
#include <net/route.h>
-extern int sysctl_tcp_syncookies;
-
static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define COOKIEBITS 24 /* Upper bits store count */
@@ -50,8 +48,7 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv4_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv4_cookie_scratch);
static u32 cookie_hash(__be32 saddr, __be32 daddr, __be16 sport, __be16 dport,
u32 count, int c)
@@ -307,7 +304,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
__u8 rcv_wscale;
struct flowi4 fl4;
- if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+ if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
goto out;
if (tcp_synq_no_recent_overflow(sk))
@@ -351,7 +348,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
treq->snt_synack.v64 = 0;
treq->tfo_listener = false;
- ireq->ir_iif = sk->sk_bound_dev_if;
+ ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
/* We throwed the options of the initial SYN away, so we hope
* the ACK carries the same options again (see RFC1122 4.2.3.8)
@@ -371,7 +368,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
* hasn't changed since we received the original syn, but I see
* no easy way to do this.
*/
- flowi4_init_output(&fl4, sk->sk_bound_dev_if, ireq->ir_mark,
+ flowi4_init_output(&fl4, ireq->ir_iif, ireq->ir_mark,
RT_CONN_FLAGS(sk), RT_SCOPE_UNIVERSE, IPPROTO_TCP,
inet_sk_flowi_flags(sk),
opt->srr ? opt->faddr : ireq->ir_rmt_addr,
diff --git a/net/ipv4/sysctl_net_ipv4.c b/net/ipv4/sysctl_net_ipv4.c
index 25300c5e283b..1e1fe6086dd9 100644
--- a/net/ipv4/sysctl_net_ipv4.c
+++ b/net/ipv4/sysctl_net_ipv4.c
@@ -24,7 +24,6 @@
#include <net/cipso_ipv4.h>
#include <net/inet_frag.h>
#include <net/ping.h>
-#include <net/tcp_memcontrol.h>
static int zero;
static int one = 1;
@@ -48,14 +47,14 @@ static void set_local_port_range(struct net *net, int range[2])
{
bool same_parity = !((range[0] ^ range[1]) & 1);
- write_seqlock(&net->ipv4.ip_local_ports.lock);
+ write_seqlock_bh(&net->ipv4.ip_local_ports.lock);
if (same_parity && !net->ipv4.ip_local_ports.warned) {
net->ipv4.ip_local_ports.warned = true;
pr_err_ratelimited("ip_local_port_range: prefer different parity for start/end values.\n");
}
net->ipv4.ip_local_ports.range[0] = range[0];
net->ipv4.ip_local_ports.range[1] = range[1];
- write_sequnlock(&net->ipv4.ip_local_ports.lock);
+ write_sequnlock_bh(&net->ipv4.ip_local_ports.lock);
}
/* Validate changes from /proc interface. */
@@ -284,31 +283,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "ip_default_ttl",
- .data = &sysctl_ip_default_ttl,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &ip_ttl_min,
- .extra2 = &ip_ttl_max,
- },
- {
- .procname = "tcp_syn_retries",
- .data = &sysctl_tcp_syn_retries,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &tcp_syn_retries_min,
- .extra2 = &tcp_syn_retries_max
- },
- {
- .procname = "tcp_synack_retries",
- .data = &sysctl_tcp_synack_retries,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_max_orphans",
.data = &sysctl_tcp_max_orphans,
.maxlen = sizeof(int),
@@ -323,72 +297,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "ip_early_demux",
- .data = &sysctl_ip_early_demux,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "ip_dynaddr",
- .data = &sysctl_ip_dynaddr,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_keepalive_time",
- .data = &sysctl_tcp_keepalive_time,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "tcp_keepalive_probes",
- .data = &sysctl_tcp_keepalive_probes,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_keepalive_intvl",
- .data = &sysctl_tcp_keepalive_intvl,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
- {
- .procname = "tcp_retries1",
- .data = &sysctl_tcp_retries1,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra2 = &tcp_retr1_max
- },
- {
- .procname = "tcp_retries2",
- .data = &sysctl_tcp_retries2,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "tcp_fin_timeout",
- .data = &sysctl_tcp_fin_timeout,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_jiffies,
- },
-#ifdef CONFIG_SYN_COOKIES
- {
- .procname = "tcp_syncookies",
- .data = &sysctl_tcp_syncookies,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
-#endif
- {
.procname = "tcp_fastopen",
.data = &sysctl_tcp_fastopen,
.maxlen = sizeof(int),
@@ -437,30 +345,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec
},
{
- .procname = "igmp_max_memberships",
- .data = &sysctl_igmp_max_memberships,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
- .procname = "igmp_max_msf",
- .data = &sysctl_igmp_max_msf,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
-#ifdef CONFIG_IP_MULTICAST
- {
- .procname = "igmp_qrv",
- .data = &sysctl_igmp_qrv,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec_minmax,
- .extra1 = &one
- },
-#endif
- {
.procname = "inet_peer_threshold",
.data = &inet_peer_threshold,
.maxlen = sizeof(int),
@@ -482,13 +366,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec_jiffies,
},
{
- .procname = "tcp_orphan_retries",
- .data = &sysctl_tcp_orphan_retries,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_fack",
.data = &sysctl_tcp_fack,
.maxlen = sizeof(int),
@@ -503,13 +380,6 @@ static struct ctl_table ipv4_table[] = {
.proc_handler = proc_dointvec,
},
{
- .procname = "tcp_reordering",
- .data = &sysctl_tcp_reordering,
- .maxlen = sizeof(int),
- .mode = 0644,
- .proc_handler = proc_dointvec
- },
- {
.procname = "tcp_max_reordering",
.data = &sysctl_tcp_max_reordering,
.maxlen = sizeof(int),
@@ -539,13 +409,6 @@ static struct ctl_table ipv4_table[] = {
.extra1 = &one,
},
{
- .procname = "tcp_notsent_lowat",
- .data = &sysctl_tcp_notsent_lowat,
- .maxlen = sizeof(sysctl_tcp_notsent_lowat),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- {
.procname = "tcp_rmem",
.data = &sysctl_tcp_rmem,
.maxlen = sizeof(sysctl_tcp_rmem),
@@ -867,6 +730,29 @@ static struct ctl_table ipv4_net_table[] = {
.proc_handler = proc_dointvec
},
{
+ .procname = "ip_dynaddr",
+ .data = &init_net.ipv4.sysctl_ip_dynaddr,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_early_demux",
+ .data = &init_net.ipv4.sysctl_ip_early_demux,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "ip_default_ttl",
+ .data = &init_net.ipv4.sysctl_ip_default_ttl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &ip_ttl_min,
+ .extra2 = &ip_ttl_max,
+ },
+ {
.procname = "ip_local_port_range",
.maxlen = sizeof(init_net.ipv4.ip_local_ports.range),
.data = &init_net.ipv4.ip_local_ports.range,
@@ -915,6 +801,17 @@ static struct ctl_table ipv4_net_table[] = {
.mode = 0644,
.proc_handler = proc_dointvec,
},
+#ifdef CONFIG_NET_L3_MASTER_DEV
+ {
+ .procname = "tcp_l3mdev_accept",
+ .data = &init_net.ipv4.sysctl_tcp_l3mdev_accept,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &zero,
+ .extra2 = &one,
+ },
+#endif
{
.procname = "tcp_mtu_probing",
.data = &init_net.ipv4.sysctl_tcp_mtu_probing,
@@ -945,11 +842,124 @@ static struct ctl_table ipv4_net_table[] = {
},
{
.procname = "igmp_link_local_mcast_reports",
- .data = &sysctl_igmp_llm_reports,
+ .data = &init_net.ipv4.sysctl_igmp_llm_reports,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "igmp_max_memberships",
+ .data = &init_net.ipv4.sysctl_igmp_max_memberships,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "igmp_max_msf",
+ .data = &init_net.ipv4.sysctl_igmp_max_msf,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#ifdef CONFIG_IP_MULTICAST
+ {
+ .procname = "igmp_qrv",
+ .data = &init_net.ipv4.sysctl_igmp_qrv,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &one
+ },
+#endif
+ {
+ .procname = "tcp_keepalive_time",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_time,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_keepalive_probes",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_probes,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_keepalive_intvl",
+ .data = &init_net.ipv4.sysctl_tcp_keepalive_intvl,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_syn_retries",
+ .data = &init_net.ipv4.sysctl_tcp_syn_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra1 = &tcp_syn_retries_min,
+ .extra2 = &tcp_syn_retries_max
+ },
+ {
+ .procname = "tcp_synack_retries",
+ .data = &init_net.ipv4.sysctl_tcp_synack_retries,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = proc_dointvec
},
+#ifdef CONFIG_SYN_COOKIES
+ {
+ .procname = "tcp_syncookies",
+ .data = &init_net.ipv4.sysctl_tcp_syncookies,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+#endif
+ {
+ .procname = "tcp_reordering",
+ .data = &init_net.ipv4.sysctl_tcp_reordering,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_retries1",
+ .data = &init_net.ipv4.sysctl_tcp_retries1,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_minmax,
+ .extra2 = &tcp_retr1_max
+ },
+ {
+ .procname = "tcp_retries2",
+ .data = &init_net.ipv4.sysctl_tcp_retries2,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_orphan_retries",
+ .data = &init_net.ipv4.sysctl_tcp_orphan_retries,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec
+ },
+ {
+ .procname = "tcp_fin_timeout",
+ .data = &init_net.ipv4.sysctl_tcp_fin_timeout,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec_jiffies,
+ },
+ {
+ .procname = "tcp_notsent_lowat",
+ .data = &init_net.ipv4.sysctl_tcp_notsent_lowat,
+ .maxlen = sizeof(unsigned int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ }
};
@@ -978,6 +988,10 @@ static __net_init int ipv4_sysctl_init_net(struct net *net)
if (!net->ipv4.sysctl_local_reserved_ports)
goto err_ports;
+ net->ipv4.sysctl_ip_default_ttl = IPDEFTTL;
+ net->ipv4.sysctl_ip_dynaddr = 0;
+ net->ipv4.sysctl_ip_early_demux = 1;
+
return 0;
err_ports:
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 0cfa7c0c1e80..08b8b960a8ed 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -247,6 +247,7 @@
#define pr_fmt(fmt) "TCP: " fmt
+#include <crypto/hash.h>
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/types.h>
@@ -266,7 +267,6 @@
#include <linux/swap.h>
#include <linux/cache.h>
#include <linux/err.h>
-#include <linux/crypto.h>
#include <linux/time.h>
#include <linux/slab.h>
@@ -279,10 +279,9 @@
#include <asm/uaccess.h>
#include <asm/ioctls.h>
+#include <asm/unaligned.h>
#include <net/busy_poll.h>
-int sysctl_tcp_fin_timeout __read_mostly = TCP_FIN_TIMEOUT;
-
int sysctl_tcp_min_tso_segs __read_mostly = 2;
int sysctl_tcp_autocorking __read_mostly = 1;
@@ -405,7 +404,7 @@ void tcp_init_sock(struct sock *sk)
tp->mss_cache = TCP_MSS_DEFAULT;
u64_stats_init(&tp->syncp);
- tp->reordering = sysctl_tcp_reordering;
+ tp->reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
tcp_enable_early_retrans(tp);
tcp_assign_congestion_control(sk);
@@ -422,7 +421,8 @@ void tcp_init_sock(struct sock *sk)
sk->sk_rcvbuf = sysctl_tcp_rmem[1];
local_bh_disable();
- sock_update_memcg(sk);
+ if (mem_cgroup_sockets_enabled)
+ sock_update_memcg(sk);
sk_sockets_allocated_inc(sk);
local_bh_enable();
}
@@ -451,11 +451,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
unsigned int mask;
struct sock *sk = sock->sk;
const struct tcp_sock *tp = tcp_sk(sk);
+ int state;
sock_rps_record_flow(sk);
sock_poll_wait(file, sk_sleep(sk), wait);
- if (sk->sk_state == TCP_LISTEN)
+
+ state = sk_state_load(sk);
+ if (state == TCP_LISTEN)
return inet_csk_listen_poll(sk);
/* Socket is not locked. We are protected from async events
@@ -492,14 +495,14 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
* NOTE. Check for TCP_CLOSE is added. The goal is to prevent
* blocking on fresh not-connected or disconnected socket. --ANK
*/
- if (sk->sk_shutdown == SHUTDOWN_MASK || sk->sk_state == TCP_CLOSE)
+ if (sk->sk_shutdown == SHUTDOWN_MASK || state == TCP_CLOSE)
mask |= POLLHUP;
if (sk->sk_shutdown & RCV_SHUTDOWN)
mask |= POLLIN | POLLRDNORM | POLLRDHUP;
/* Connected or passive Fast Open socket? */
- if (sk->sk_state != TCP_SYN_SENT &&
- (sk->sk_state != TCP_SYN_RECV || tp->fastopen_rsk)) {
+ if (state != TCP_SYN_SENT &&
+ (state != TCP_SYN_RECV || tp->fastopen_rsk)) {
int target = sock_rcvlowat(sk, 0, INT_MAX);
if (tp->urg_seq == tp->copied_seq &&
@@ -507,9 +510,6 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
tp->urg_data)
target++;
- /* Potential race condition. If read of tp below will
- * escape above sk->sk_state, we can be illegally awaken
- * in SYN_* states. */
if (tp->rcv_nxt - tp->copied_seq >= target)
mask |= POLLIN | POLLRDNORM;
@@ -517,8 +517,7 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sk_stream_is_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else { /* send SIGIO later */
- set_bit(SOCK_ASYNC_NOSPACE,
- &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
/* Race breaker. If space is freed after
@@ -557,20 +556,7 @@ int tcp_ioctl(struct sock *sk, int cmd, unsigned long arg)
return -EINVAL;
slow = lock_sock_fast(sk);
- if ((1 << sk->sk_state) & (TCPF_SYN_SENT | TCPF_SYN_RECV))
- answ = 0;
- else if (sock_flag(sk, SOCK_URGINLINE) ||
- !tp->urg_data ||
- before(tp->urg_seq, tp->copied_seq) ||
- !before(tp->urg_seq, tp->rcv_nxt)) {
-
- answ = tp->rcv_nxt - tp->copied_seq;
-
- /* Subtract 1, if FIN was received */
- if (answ && sock_flag(sk, SOCK_DONE))
- answ--;
- } else
- answ = tp->urg_seq - tp->copied_seq;
+ answ = tcp_inq(sk);
unlock_sock_fast(sk, slow);
break;
case SIOCATMARK:
@@ -906,7 +892,7 @@ static ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
goto out_err;
}
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
copied = 0;
@@ -939,7 +925,7 @@ new_segment:
i = skb_shinfo(skb)->nr_frags;
can_coalesce = skb_can_coalesce(skb, i, page, offset);
- if (!can_coalesce && i >= MAX_SKB_FRAGS) {
+ if (!can_coalesce && i >= sysctl_max_skb_frags) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1019,7 +1005,7 @@ int tcp_sendpage(struct sock *sk, struct page *page, int offset,
ssize_t res;
if (!(sk->sk_route_caps & NETIF_F_SG) ||
- !(sk->sk_route_caps & NETIF_F_ALL_CSUM))
+ !sk_check_csum_caps(sk))
return sock_no_sendpage(sk->sk_socket, page, offset, size,
flags);
@@ -1134,7 +1120,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
}
/* This should be in poll */
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
@@ -1176,7 +1162,7 @@ new_segment:
/*
* Check whether we can use HW checksum.
*/
- if (sk->sk_route_caps & NETIF_F_ALL_CSUM)
+ if (sk_check_csum_caps(sk))
skb->ip_summed = CHECKSUM_PARTIAL;
skb_entail(sk, skb);
@@ -1212,7 +1198,7 @@ new_segment:
if (!skb_can_coalesce(skb, i, pfrag->page,
pfrag->offset)) {
- if (i == MAX_SKB_FRAGS || !sg) {
+ if (i == sysctl_max_skb_frags || !sg) {
tcp_mark_push(tp, skb);
goto new_segment;
}
@@ -1465,8 +1451,10 @@ static struct sk_buff *tcp_recv_skb(struct sock *sk, u32 seq, u32 *off)
while ((skb = skb_peek(&sk->sk_receive_queue)) != NULL) {
offset = seq - TCP_SKB_CB(skb)->seq;
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_err_once("%s: found a SYN, please report !\n", __func__);
offset--;
+ }
if (offset < skb->len || (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)) {
*off = offset;
return skb;
@@ -1656,8 +1644,10 @@ int tcp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int nonblock,
break;
offset = *seq - TCP_SKB_CB(skb)->seq;
- if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)
+ if (unlikely(TCP_SKB_CB(skb)->tcp_flags & TCPHDR_SYN)) {
+ pr_err_once("%s: found a SYN, please report !\n", __func__);
offset--;
+ }
if (offset < skb->len)
goto found_ok_skb;
if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
@@ -1934,7 +1924,7 @@ void tcp_set_state(struct sock *sk, int state)
/* Change state AFTER socket is unhashed to avoid closed
* socket sitting in hash tables.
*/
- sk->sk_state = state;
+ sk_state_store(sk, state);
#ifdef STATE_TRACE
SOCK_DEBUG(sk, "TCP sk=%p, State %s -> %s\n", sk, statename[oldstate], statename[state]);
@@ -2325,6 +2315,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
{
struct tcp_sock *tp = tcp_sk(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
+ struct net *net = sock_net(sk);
int val;
int err = 0;
@@ -2521,7 +2512,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
case TCP_LINGER2:
if (val < 0)
tp->linger2 = -1;
- else if (val > sysctl_tcp_fin_timeout / HZ)
+ else if (val > net->ipv4.sysctl_tcp_fin_timeout / HZ)
tp->linger2 = 0;
else
tp->linger2 = val * HZ;
@@ -2638,13 +2629,16 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
const struct inet_connection_sock *icsk = inet_csk(sk);
u32 now = tcp_time_stamp;
unsigned int start;
+ int notsent_bytes;
+ u64 rate64;
u32 rate;
memset(info, 0, sizeof(*info));
if (sk->sk_type != SOCK_STREAM)
return;
- info->tcpi_state = sk->sk_state;
+ info->tcpi_state = sk_state_load(sk);
+
info->tcpi_ca_state = icsk->icsk_ca_state;
info->tcpi_retransmits = icsk->icsk_retransmits;
info->tcpi_probes = icsk->icsk_probes_out;
@@ -2672,7 +2666,7 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_snd_mss = tp->mss_cache;
info->tcpi_rcv_mss = icsk->icsk_ack.rcv_mss;
- if (sk->sk_state == TCP_LISTEN) {
+ if (info->tcpi_state == TCP_LISTEN) {
info->tcpi_unacked = sk->sk_ack_backlog;
info->tcpi_sacked = sk->sk_max_ack_backlog;
} else {
@@ -2702,18 +2696,27 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info)
info->tcpi_total_retrans = tp->total_retrans;
rate = READ_ONCE(sk->sk_pacing_rate);
- info->tcpi_pacing_rate = rate != ~0U ? rate : ~0ULL;
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ put_unaligned(rate64, &info->tcpi_pacing_rate);
rate = READ_ONCE(sk->sk_max_pacing_rate);
- info->tcpi_max_pacing_rate = rate != ~0U ? rate : ~0ULL;
+ rate64 = rate != ~0U ? rate : ~0ULL;
+ put_unaligned(rate64, &info->tcpi_max_pacing_rate);
do {
start = u64_stats_fetch_begin_irq(&tp->syncp);
- info->tcpi_bytes_acked = tp->bytes_acked;
- info->tcpi_bytes_received = tp->bytes_received;
+ put_unaligned(tp->bytes_acked, &info->tcpi_bytes_acked);
+ put_unaligned(tp->bytes_received, &info->tcpi_bytes_received);
} while (u64_stats_fetch_retry_irq(&tp->syncp, start));
info->tcpi_segs_out = tp->segs_out;
info->tcpi_segs_in = tp->segs_in;
+
+ notsent_bytes = READ_ONCE(tp->write_seq) - READ_ONCE(tp->snd_nxt);
+ info->tcpi_notsent_bytes = max(0, notsent_bytes);
+
+ info->tcpi_min_rtt = tcp_min_rtt(tp);
+ info->tcpi_data_segs_in = tp->data_segs_in;
+ info->tcpi_data_segs_out = tp->data_segs_out;
}
EXPORT_SYMBOL_GPL(tcp_get_info);
@@ -2722,6 +2725,7 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
int val, len;
if (get_user(len, optlen))
@@ -2756,12 +2760,12 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
val = keepalive_probes(tp);
break;
case TCP_SYNCNT:
- val = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ val = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
break;
case TCP_LINGER2:
val = tp->linger2;
if (val >= 0)
- val = (val ? : sysctl_tcp_fin_timeout) / HZ;
+ val = (val ? : net->ipv4.sysctl_tcp_fin_timeout) / HZ;
break;
case TCP_DEFER_ACCEPT:
val = retrans_to_secs(icsk->icsk_accept_queue.rskq_defer_accept,
@@ -2938,17 +2942,26 @@ static bool tcp_md5sig_pool_populated = false;
static void __tcp_alloc_md5sig_pool(void)
{
+ struct crypto_ahash *hash;
int cpu;
+ hash = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(hash))
+ return;
+
for_each_possible_cpu(cpu) {
- if (!per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm) {
- struct crypto_hash *hash;
+ struct ahash_request *req;
- hash = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR_OR_NULL(hash))
- return;
- per_cpu(tcp_md5sig_pool, cpu).md5_desc.tfm = hash;
- }
+ if (per_cpu(tcp_md5sig_pool, cpu).md5_req)
+ continue;
+
+ req = ahash_request_alloc(hash, GFP_KERNEL);
+ if (!req)
+ return;
+
+ ahash_request_set_callback(req, 0, NULL, NULL);
+
+ per_cpu(tcp_md5sig_pool, cpu).md5_req = req;
}
/* before setting tcp_md5sig_pool_populated, we must commit all writes
* to memory. See smp_rmb() in tcp_get_md5sig_pool()
@@ -2998,7 +3011,6 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
{
struct scatterlist sg;
struct tcphdr hdr;
- int err;
/* We are not allowed to change tcphdr, make a local copy */
memcpy(&hdr, th, sizeof(hdr));
@@ -3006,8 +3018,8 @@ int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
/* options aren't included in the hash */
sg_init_one(&sg, &hdr, sizeof(hdr));
- err = crypto_hash_update(&hp->md5_desc, &sg, sizeof(hdr));
- return err;
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr));
+ return crypto_ahash_update(hp->md5_req);
}
EXPORT_SYMBOL(tcp_md5_hash_header);
@@ -3016,7 +3028,7 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
{
struct scatterlist sg;
const struct tcphdr *tp = tcp_hdr(skb);
- struct hash_desc *desc = &hp->md5_desc;
+ struct ahash_request *req = hp->md5_req;
unsigned int i;
const unsigned int head_data_len = skb_headlen(skb) > header_len ?
skb_headlen(skb) - header_len : 0;
@@ -3026,7 +3038,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
sg_init_table(&sg, 1);
sg_set_buf(&sg, ((u8 *) tp) + header_len, head_data_len);
- if (crypto_hash_update(desc, &sg, head_data_len))
+ ahash_request_set_crypt(req, &sg, NULL, head_data_len);
+ if (crypto_ahash_update(req))
return 1;
for (i = 0; i < shi->nr_frags; ++i) {
@@ -3036,7 +3049,8 @@ int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
sg_set_page(&sg, page, skb_frag_size(f),
offset_in_page(offset));
- if (crypto_hash_update(desc, &sg, skb_frag_size(f)))
+ ahash_request_set_crypt(req, &sg, NULL, skb_frag_size(f));
+ if (crypto_ahash_update(req))
return 1;
}
@@ -3053,7 +3067,8 @@ int tcp_md5_hash_key(struct tcp_md5sig_pool *hp, const struct tcp_md5sig_key *ke
struct scatterlist sg;
sg_init_one(&sg, key->key, key->keylen);
- return crypto_hash_update(&hp->md5_desc, &sg, key->keylen);
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL, key->keylen);
+ return crypto_ahash_update(hp->md5_req);
}
EXPORT_SYMBOL(tcp_md5_hash_key);
@@ -3080,6 +3095,52 @@ void tcp_done(struct sock *sk)
}
EXPORT_SYMBOL_GPL(tcp_done);
+int tcp_abort(struct sock *sk, int err)
+{
+ if (!sk_fullsock(sk)) {
+ if (sk->sk_state == TCP_NEW_SYN_RECV) {
+ struct request_sock *req = inet_reqsk(sk);
+
+ local_bh_disable();
+ inet_csk_reqsk_queue_drop_and_put(req->rsk_listener,
+ req);
+ local_bh_enable();
+ return 0;
+ }
+ sock_gen_put(sk);
+ return -EOPNOTSUPP;
+ }
+
+ /* Don't race with userspace socket closes such as tcp_close. */
+ lock_sock(sk);
+
+ if (sk->sk_state == TCP_LISTEN) {
+ tcp_set_state(sk, TCP_CLOSE);
+ inet_csk_listen_stop(sk);
+ }
+
+ /* Don't race with BH socket closes such as inet_csk_listen_stop. */
+ local_bh_disable();
+ bh_lock_sock(sk);
+
+ if (!sock_flag(sk, SOCK_DEAD)) {
+ sk->sk_err = err;
+ /* This barrier is coupled with smp_rmb() in tcp_poll() */
+ smp_wmb();
+ sk->sk_error_report(sk);
+ if (tcp_need_reset(sk->sk_state))
+ tcp_send_active_reset(sk, GFP_ATOMIC);
+ tcp_done(sk);
+ }
+
+ bh_unlock_sock(sk);
+ local_bh_enable();
+ release_sock(sk);
+ sock_put(sk);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(tcp_abort);
+
extern struct tcp_congestion_ops tcp_reno;
static __initdata unsigned long thash_entries;
diff --git a/net/ipv4/tcp_diag.c b/net/ipv4/tcp_diag.c
index 479f34946177..4d610934fb39 100644
--- a/net/ipv4/tcp_diag.c
+++ b/net/ipv4/tcp_diag.c
@@ -10,6 +10,8 @@
*/
#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/sock_diag.h>
#include <linux/inet_diag.h>
#include <linux/tcp.h>
@@ -21,7 +23,7 @@ static void tcp_diag_get_info(struct sock *sk, struct inet_diag_msg *r,
{
struct tcp_info *info = _info;
- if (sk->sk_state == TCP_LISTEN) {
+ if (sk_state_load(sk) == TCP_LISTEN) {
r->idiag_rqueue = sk->sk_ack_backlog;
r->idiag_wqueue = sk->sk_max_ack_backlog;
} else if (sk->sk_type == SOCK_STREAM) {
@@ -46,12 +48,29 @@ static int tcp_diag_dump_one(struct sk_buff *in_skb, const struct nlmsghdr *nlh,
return inet_diag_dump_one_icsk(&tcp_hashinfo, in_skb, nlh, req);
}
+#ifdef CONFIG_INET_DIAG_DESTROY
+static int tcp_diag_destroy(struct sk_buff *in_skb,
+ const struct inet_diag_req_v2 *req)
+{
+ struct net *net = sock_net(in_skb->sk);
+ struct sock *sk = inet_diag_find_one_icsk(net, &tcp_hashinfo, req);
+
+ if (IS_ERR(sk))
+ return PTR_ERR(sk);
+
+ return sock_diag_destroy(sk, ECONNABORTED);
+}
+#endif
+
static const struct inet_diag_handler tcp_diag_handler = {
.dump = tcp_diag_dump,
.dump_one = tcp_diag_dump_one,
.idiag_get_info = tcp_diag_get_info,
.idiag_type = IPPROTO_TCP,
.idiag_info_size = sizeof(struct tcp_info),
+#ifdef CONFIG_INET_DIAG_DESTROY
+ .destroy = tcp_diag_destroy,
+#endif
};
static int __init tcp_diag_init(void)
diff --git a/net/ipv4/tcp_fastopen.c b/net/ipv4/tcp_fastopen.c
index 55be6ac70cff..cffd8f9ed1a9 100644
--- a/net/ipv4/tcp_fastopen.c
+++ b/net/ipv4/tcp_fastopen.c
@@ -1,3 +1,4 @@
+#include <linux/crypto.h>
#include <linux/err.h>
#include <linux/init.h>
#include <linux/kernel.h>
@@ -124,6 +125,49 @@ static bool tcp_fastopen_cookie_gen(struct request_sock *req,
return false;
}
+
+/* If an incoming SYN or SYNACK frame contains a payload and/or FIN,
+ * queue this additional data / FIN.
+ */
+void tcp_fastopen_add_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (TCP_SKB_CB(skb)->end_seq == tp->rcv_nxt)
+ return;
+
+ skb = skb_clone(skb, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ skb_dst_drop(skb);
+ /* segs_in has been initialized to 1 in tcp_create_openreq_child().
+ * Hence, reset segs_in to 0 before calling tcp_segs_in()
+ * to avoid double counting. Also, tcp_segs_in() expects
+ * skb->len to include the tcp_hdrlen. Hence, it should
+ * be called before __skb_pull().
+ */
+ tp->segs_in = 0;
+ tcp_segs_in(tp, skb);
+ __skb_pull(skb, tcp_hdrlen(skb));
+ skb_set_owner_r(skb, sk);
+
+ TCP_SKB_CB(skb)->seq++;
+ TCP_SKB_CB(skb)->tcp_flags &= ~TCPHDR_SYN;
+
+ tp->rcv_nxt = TCP_SKB_CB(skb)->end_seq;
+ __skb_queue_tail(&sk->sk_receive_queue, skb);
+ tp->syn_data_acked = 1;
+
+ /* u64_stats_update_begin(&tp->syncp) not needed here,
+ * as we certainly are not changing upper 32bit value (0)
+ */
+ tp->bytes_received = skb->len;
+
+ if (TCP_SKB_CB(skb)->tcp_flags & TCPHDR_FIN)
+ tcp_fin(sk);
+}
+
static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct sk_buff *skb,
struct dst_entry *dst,
@@ -132,7 +176,6 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
struct tcp_sock *tp;
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
struct sock *child;
- u32 end_seq;
bool own_req;
req->num_retrans = 0;
@@ -178,35 +221,11 @@ static struct sock *tcp_fastopen_create_child(struct sock *sk,
tcp_init_metrics(child);
tcp_init_buffer_space(child);
- /* Queue the data carried in the SYN packet.
- * We used to play tricky games with skb_get().
- * With lockless listener, it is a dead end.
- * Do not think about it.
- *
- * XXX (TFO) - we honor a zero-payload TFO request for now,
- * (any reason not to?) but no need to queue the skb since
- * there is no data. How about SYN+FIN?
- */
- end_seq = TCP_SKB_CB(skb)->end_seq;
- if (end_seq != TCP_SKB_CB(skb)->seq + 1) {
- struct sk_buff *skb2 = skb_clone(skb, GFP_ATOMIC);
-
- if (likely(skb2)) {
- skb_dst_drop(skb2);
- __skb_pull(skb2, tcp_hdrlen(skb));
- skb_set_owner_r(skb2, child);
- __skb_queue_tail(&child->sk_receive_queue, skb2);
- tp->syn_data_acked = 1;
-
- /* u64_stats_update_begin(&tp->syncp) not needed here,
- * as we certainly are not changing upper 32bit value (0)
- */
- tp->bytes_received = end_seq - TCP_SKB_CB(skb)->seq - 1;
- } else {
- end_seq = TCP_SKB_CB(skb)->seq + 1;
- }
- }
- tcp_rsk(req)->rcv_nxt = tp->rcv_nxt = end_seq;
+ tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+
+ tcp_fastopen_add_skb(child, skb);
+
+ tcp_rsk(req)->rcv_nxt = tp->rcv_nxt;
/* tcp_conn_request() is sending the SYNACK,
* and queues the child into listener accept queue.
*/
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index fdd88c3803a6..e6e65f79ade8 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -80,9 +80,7 @@ int sysctl_tcp_timestamps __read_mostly = 1;
int sysctl_tcp_window_scaling __read_mostly = 1;
int sysctl_tcp_sack __read_mostly = 1;
int sysctl_tcp_fack __read_mostly = 1;
-int sysctl_tcp_reordering __read_mostly = TCP_FASTRETRANS_THRESH;
int sysctl_tcp_max_reordering __read_mostly = 300;
-EXPORT_SYMBOL(sysctl_tcp_reordering);
int sysctl_tcp_dsack __read_mostly = 1;
int sysctl_tcp_app_win __read_mostly = 31;
int sysctl_tcp_adv_win_scale __read_mostly = 1;
@@ -126,6 +124,10 @@ int sysctl_tcp_invalid_ratelimit __read_mostly = HZ/2;
#define TCP_REMNANT (TCP_FLAG_FIN|TCP_FLAG_URG|TCP_FLAG_SYN|TCP_FLAG_PSH)
#define TCP_HP_BITS (~(TCP_RESERVED_BITS|TCP_FLAG_PSH))
+#define REXMIT_NONE 0 /* no loss recovery to do */
+#define REXMIT_LOST 1 /* retransmit packets marked lost */
+#define REXMIT_NEW 2 /* FRTO-style transmit of unsent/new packets */
+
/* Adapt the MSS value used to make delayed ack decision to the
* real world.
*/
@@ -1210,6 +1212,7 @@ static u8 tcp_sacktag_one(struct sock *sk,
sacked |= TCPCB_SACKED_ACKED;
state->flag |= FLAG_DATA_SACKED;
tp->sacked_out += pcount;
+ tp->delivered += pcount; /* Out-of-order packets delivered */
fack_count += pcount;
@@ -1821,8 +1824,12 @@ static void tcp_check_reno_reordering(struct sock *sk, const int addend)
static void tcp_add_reno_sack(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ u32 prior_sacked = tp->sacked_out;
+
tp->sacked_out++;
tcp_check_reno_reordering(sk, 0);
+ if (tp->sacked_out > prior_sacked)
+ tp->delivered++; /* Some out-of-order packet is delivered */
tcp_verify_left_out(tp);
}
@@ -1834,6 +1841,7 @@ static void tcp_remove_reno_sacks(struct sock *sk, int acked)
if (acked > 0) {
/* One ACK acked hole. The rest eat duplicate ACKs. */
+ tp->delivered += max_t(int, acked - tp->sacked_out, 1);
if (acked - 1 >= tp->sacked_out)
tp->sacked_out = 0;
else
@@ -1873,6 +1881,7 @@ void tcp_enter_loss(struct sock *sk)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct sk_buff *skb;
bool new_recovery = icsk->icsk_ca_state < TCP_CA_Recovery;
bool is_reneg; /* is receiver reneging on SACKs? */
@@ -1923,9 +1932,9 @@ void tcp_enter_loss(struct sock *sk)
* suggests that the degree of reordering is over-estimated.
*/
if (icsk->icsk_ca_state <= TCP_CA_Disorder &&
- tp->sacked_out >= sysctl_tcp_reordering)
+ tp->sacked_out >= net->ipv4.sysctl_tcp_reordering)
tp->reordering = min_t(unsigned int, tp->reordering,
- sysctl_tcp_reordering);
+ net->ipv4.sysctl_tcp_reordering);
tcp_set_ca_state(sk, TCP_CA_Loss);
tp->high_seq = tp->snd_nxt;
tcp_ecn_queue_cwr(tp);
@@ -2109,6 +2118,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
__u32 packets_out;
+ int tcp_reordering = sock_net(sk)->ipv4.sysctl_tcp_reordering;
/* Trick#1: The loss is proven. */
if (tp->lost_out)
@@ -2123,7 +2133,7 @@ static bool tcp_time_to_recover(struct sock *sk, int flag)
*/
packets_out = tp->packets_out;
if (packets_out <= tp->reordering &&
- tp->sacked_out >= max_t(__u32, packets_out/2, sysctl_tcp_reordering) &&
+ tp->sacked_out >= max_t(__u32, packets_out/2, tcp_reordering) &&
!tcp_may_send_now(sk)) {
/* We have nothing to send. This connection is limited
* either by receiver window or by application.
@@ -2164,8 +2174,7 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
{
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
- int cnt, oldcnt;
- int err;
+ int cnt, oldcnt, lost;
unsigned int mss;
/* Use SACK to deduce losses of new sequences sent during recovery */
const u32 loss_high = tcp_is_sack(tp) ? tp->snd_nxt : tp->high_seq;
@@ -2205,9 +2214,10 @@ static void tcp_mark_head_lost(struct sock *sk, int packets, int mark_head)
break;
mss = tcp_skb_mss(skb);
- err = tcp_fragment(sk, skb, (packets - oldcnt) * mss,
- mss, GFP_ATOMIC);
- if (err < 0)
+ /* If needed, chop off the prefix to mark as lost. */
+ lost = (packets - oldcnt) * mss;
+ if (lost < skb->len &&
+ tcp_fragment(sk, skb, lost, mss, GFP_ATOMIC) < 0)
break;
cnt = packets;
}
@@ -2366,8 +2376,6 @@ static void tcp_undo_cwnd_reduction(struct sock *sk, bool unmark_loss)
tp->snd_ssthresh = tp->prior_ssthresh;
tcp_ecn_withdraw_cwr(tp);
}
- } else {
- tp->snd_cwnd = max(tp->snd_cwnd, tp->snd_ssthresh);
}
tp->snd_cwnd_stamp = tcp_time_stamp;
tp->undo_marker = 0;
@@ -2469,14 +2477,15 @@ static void tcp_init_cwnd_reduction(struct sock *sk)
tcp_ecn_queue_cwr(tp);
}
-static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
- int fast_rexmit, int flag)
+static void tcp_cwnd_reduction(struct sock *sk, int newly_acked_sacked,
+ int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
int sndcnt = 0;
int delta = tp->snd_ssthresh - tcp_packets_in_flight(tp);
- int newly_acked_sacked = prior_unsacked -
- (tp->packets_out - tp->sacked_out);
+
+ if (newly_acked_sacked <= 0 || WARN_ON_ONCE(!tp->prior_cwnd))
+ return;
tp->prr_delivered += newly_acked_sacked;
if (delta < 0) {
@@ -2491,7 +2500,8 @@ static void tcp_cwnd_reduction(struct sock *sk, const int prior_unsacked,
} else {
sndcnt = min(delta, newly_acked_sacked);
}
- sndcnt = max(sndcnt, (fast_rexmit ? 1 : 0));
+ /* Force a fast retransmit upon entering fast recovery */
+ sndcnt = max(sndcnt, (tp->prr_out ? 0 : 1));
tp->snd_cwnd = tcp_packets_in_flight(tp) + sndcnt;
}
@@ -2536,7 +2546,7 @@ static void tcp_try_keep_open(struct sock *sk)
}
}
-static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
+static void tcp_try_to_open(struct sock *sk, int flag)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2550,8 +2560,6 @@ static void tcp_try_to_open(struct sock *sk, int flag, const int prior_unsacked)
if (inet_csk(sk)->icsk_ca_state != TCP_CA_CWR) {
tcp_try_keep_open(sk);
- } else {
- tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
}
}
@@ -2661,7 +2669,8 @@ static void tcp_enter_recovery(struct sock *sk, bool ece_ack)
/* Process an ACK in CA_Loss state. Move to CA_Open if lost data are
* recovered or spurious. Otherwise retransmits more on partial ACKs.
*/
-static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
+static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack,
+ int *rexmit)
{
struct tcp_sock *tp = tcp_sk(sk);
bool recovered = !before(tp->snd_una, tp->high_seq);
@@ -2683,10 +2692,15 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
tp->frto = 0; /* Step 3.a. loss was real */
} else if (flag & FLAG_SND_UNA_ADVANCED && !recovered) {
tp->high_seq = tp->snd_nxt;
- __tcp_push_pending_frames(sk, tcp_current_mss(sk),
- TCP_NAGLE_OFF);
- if (after(tp->snd_nxt, tp->high_seq))
- return; /* Step 2.b */
+ /* Step 2.b. Try send new data (but deferred until cwnd
+ * is updated in tcp_ack()). Otherwise fall back to
+ * the conventional recovery.
+ */
+ if (tcp_send_head(sk) &&
+ after(tcp_wnd_end(tp), tp->snd_nxt)) {
+ *rexmit = REXMIT_NEW;
+ return;
+ }
tp->frto = 0;
}
}
@@ -2705,12 +2719,11 @@ static void tcp_process_loss(struct sock *sk, int flag, bool is_dupack)
else if (flag & FLAG_SND_UNA_ADVANCED)
tcp_reset_reno_sack(tp);
}
- tcp_xmit_retransmit_queue(sk);
+ *rexmit = REXMIT_LOST;
}
/* Undo during fast recovery after partial ACK. */
-static bool tcp_try_undo_partial(struct sock *sk, const int acked,
- const int prior_unsacked, int flag)
+static bool tcp_try_undo_partial(struct sock *sk, const int acked)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -2725,10 +2738,8 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* can undo. Otherwise we clock out new packets but do not
* mark more packets lost or retransmit more.
*/
- if (tp->retrans_out) {
- tcp_cwnd_reduction(sk, prior_unsacked, 0, flag);
+ if (tp->retrans_out)
return true;
- }
if (!tcp_any_retrans_done(sk))
tp->retrans_stamp = 0;
@@ -2747,21 +2758,21 @@ static bool tcp_try_undo_partial(struct sock *sk, const int acked,
* taking into account both packets sitting in receiver's buffer and
* packets lost by network.
*
- * Besides that it does CWND reduction, when packet loss is detected
- * and changes state of machine.
+ * Besides that it updates the congestion state when packet loss or ECN
+ * is detected. But it does not reduce the cwnd, it is done by the
+ * congestion control later.
*
* It does _not_ decide what to send, it is made in function
* tcp_xmit_retransmit_queue().
*/
static void tcp_fastretrans_alert(struct sock *sk, const int acked,
- const int prior_unsacked,
- bool is_dupack, int flag)
+ bool is_dupack, int *ack_flag, int *rexmit)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ int fast_rexmit = 0, flag = *ack_flag;
bool do_lost = is_dupack || ((flag & FLAG_DATA_SACKED) &&
(tcp_fackets_out(tp) > tp->reordering));
- int fast_rexmit = 0;
if (WARN_ON(!tp->packets_out && tp->sacked_out))
tp->sacked_out = 0;
@@ -2808,8 +2819,10 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
/* Use RACK to detect loss */
if (sysctl_tcp_recovery & TCP_RACK_LOST_RETRANS &&
- tcp_rack_mark_lost(sk))
+ tcp_rack_mark_lost(sk)) {
flag |= FLAG_LOST_RETRANS;
+ *ack_flag |= FLAG_LOST_RETRANS;
+ }
/* E. Process state. */
switch (icsk->icsk_ca_state) {
@@ -2818,7 +2831,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (tcp_is_reno(tp) && is_dupack)
tcp_add_reno_sack(sk);
} else {
- if (tcp_try_undo_partial(sk, acked, prior_unsacked, flag))
+ if (tcp_try_undo_partial(sk, acked))
return;
/* Partial ACK arrived. Force fast retransmit. */
do_lost = tcp_is_reno(tp) ||
@@ -2830,7 +2843,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
}
break;
case TCP_CA_Loss:
- tcp_process_loss(sk, flag, is_dupack);
+ tcp_process_loss(sk, flag, is_dupack, rexmit);
if (icsk->icsk_ca_state != TCP_CA_Open &&
!(flag & FLAG_LOST_RETRANS))
return;
@@ -2847,7 +2860,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
tcp_try_undo_dsack(sk);
if (!tcp_time_to_recover(sk, flag)) {
- tcp_try_to_open(sk, flag, prior_unsacked);
+ tcp_try_to_open(sk, flag);
return;
}
@@ -2869,8 +2882,7 @@ static void tcp_fastretrans_alert(struct sock *sk, const int acked,
if (do_lost)
tcp_update_scoreboard(sk, fast_rexmit);
- tcp_cwnd_reduction(sk, prior_unsacked, fast_rexmit, flag);
- tcp_xmit_retransmit_queue(sk);
+ *rexmit = REXMIT_LOST;
}
/* Kathleen Nichols' algorithm for tracking the minimum value of
@@ -2895,7 +2907,10 @@ static void tcp_update_rtt_min(struct sock *sk, u32 rtt_us)
{
const u32 now = tcp_time_stamp, wlen = sysctl_tcp_min_rtt_wlen * HZ;
struct rtt_meas *m = tcp_sk(sk)->rtt_min;
- struct rtt_meas rttm = { .rtt = (rtt_us ? : 1), .ts = now };
+ struct rtt_meas rttm = {
+ .rtt = likely(rtt_us) ? rtt_us : jiffies_to_usecs(1),
+ .ts = now,
+ };
u32 elapsed;
/* Check if the new measurement updates the 1st, 2nd, or 3rd choices */
@@ -3092,7 +3107,7 @@ static void tcp_ack_tstamp(struct sock *sk, struct sk_buff *skb,
* arrived at the other end.
*/
static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
- u32 prior_snd_una,
+ u32 prior_snd_una, int *acked,
struct tcp_sacktag_state *sack)
{
const struct inet_connection_sock *icsk = inet_csk(sk);
@@ -3150,10 +3165,13 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
flag |= FLAG_ORIG_SACK_ACKED;
}
- if (sacked & TCPCB_SACKED_ACKED)
+ if (sacked & TCPCB_SACKED_ACKED) {
tp->sacked_out -= acked_pcount;
- else if (tcp_is_sack(tp) && !tcp_skb_spurious_retrans(tp, skb))
- tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ } else if (tcp_is_sack(tp)) {
+ tp->delivered += acked_pcount;
+ if (!tcp_skb_spurious_retrans(tp, skb))
+ tcp_rack_advance(tp, &skb->skb_mstamp, sacked);
+ }
if (sacked & TCPCB_LOST)
tp->lost_out -= acked_pcount;
@@ -3262,6 +3280,7 @@ static int tcp_clean_rtx_queue(struct sock *sk, int prior_fackets,
}
}
#endif
+ *acked = pkts_acked;
return flag;
}
@@ -3295,21 +3314,36 @@ static inline bool tcp_ack_is_dubious(const struct sock *sk, const int flag)
/* Decide wheather to run the increase function of congestion control. */
static inline bool tcp_may_raise_cwnd(const struct sock *sk, const int flag)
{
- if (tcp_in_cwnd_reduction(sk))
- return false;
-
/* If reordering is high then always grow cwnd whenever data is
* delivered regardless of its ordering. Otherwise stay conservative
* and only grow cwnd on in-order delivery (RFC5681). A stretched ACK w/
* new SACK or ECE mark may first advance cwnd here and later reduce
* cwnd in tcp_fastretrans_alert() based on more states.
*/
- if (tcp_sk(sk)->reordering > sysctl_tcp_reordering)
+ if (tcp_sk(sk)->reordering > sock_net(sk)->ipv4.sysctl_tcp_reordering)
return flag & FLAG_FORWARD_PROGRESS;
return flag & FLAG_DATA_ACKED;
}
+/* The "ultimate" congestion control function that aims to replace the rigid
+ * cwnd increase and decrease control (tcp_cong_avoid,tcp_*cwnd_reduction).
+ * It's called toward the end of processing an ACK with precise rate
+ * information. All transmission or retransmission are delayed afterwards.
+ */
+static void tcp_cong_control(struct sock *sk, u32 ack, u32 acked_sacked,
+ int flag)
+{
+ if (tcp_in_cwnd_reduction(sk)) {
+ /* Reduce cwnd if state mandates */
+ tcp_cwnd_reduction(sk, acked_sacked, flag);
+ } else if (tcp_may_raise_cwnd(sk, flag)) {
+ /* Advance cwnd if state allows */
+ tcp_cong_avoid(sk, ack, acked_sacked);
+ }
+ tcp_update_pacing_rate(sk);
+}
+
/* Check that window update is acceptable.
* The function assumes that snd_una<=ack<=snd_next.
*/
@@ -3505,6 +3539,27 @@ static inline void tcp_in_ack_event(struct sock *sk, u32 flags)
icsk->icsk_ca_ops->in_ack_event(sk, flags);
}
+/* Congestion control has updated the cwnd already. So if we're in
+ * loss recovery then now we do any new sends (for FRTO) or
+ * retransmits (for CA_Loss or CA_recovery) that make sense.
+ */
+static void tcp_xmit_recovery(struct sock *sk, int rexmit)
+{
+ struct tcp_sock *tp = tcp_sk(sk);
+
+ if (rexmit == REXMIT_NONE)
+ return;
+
+ if (unlikely(rexmit == 2)) {
+ __tcp_push_pending_frames(sk, tcp_current_mss(sk),
+ TCP_NAGLE_OFF);
+ if (after(tp->snd_nxt, tp->high_seq))
+ return;
+ tp->frto = 0;
+ }
+ tcp_xmit_retransmit_queue(sk);
+}
+
/* This routine deals with incoming acks, but not outgoing ones. */
static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
{
@@ -3517,8 +3572,9 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
bool is_dupack = false;
u32 prior_fackets;
int prior_packets = tp->packets_out;
- const int prior_unsacked = tp->packets_out - tp->sacked_out;
+ u32 prior_delivered = tp->delivered;
int acked = 0; /* Number of packets newly acked */
+ int rexmit = REXMIT_NONE; /* Flag to (re)transmit to recover losses */
sack_state.first_sackt.v64 = 0;
@@ -3607,23 +3663,16 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
goto no_queue;
/* See if we can take anything off of the retransmit queue. */
- acked = tp->packets_out;
- flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una,
+ flag |= tcp_clean_rtx_queue(sk, prior_fackets, prior_snd_una, &acked,
&sack_state);
- acked -= tp->packets_out;
if (tcp_ack_is_dubious(sk, flag)) {
is_dupack = !(flag & (FLAG_SND_UNA_ADVANCED | FLAG_NOT_DUP));
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
}
if (tp->tlp_high_seq)
tcp_process_tlp_ack(sk, ack, flag);
- /* Advance cwnd if state allows */
- if (tcp_may_raise_cwnd(sk, flag))
- tcp_cong_avoid(sk, ack, acked);
-
if ((flag & FLAG_FORWARD_PROGRESS) || !(flag & FLAG_NOT_DUP)) {
struct dst_entry *dst = __sk_dst_get(sk);
if (dst)
@@ -3632,14 +3681,14 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag)
if (icsk->icsk_pending == ICSK_TIME_RETRANS)
tcp_schedule_loss_probe(sk);
- tcp_update_pacing_rate(sk);
+ tcp_cong_control(sk, ack, tp->delivered - prior_delivered, flag);
+ tcp_xmit_recovery(sk, rexmit);
return 1;
no_queue:
/* If data was DSACKed, see if we can undo a cwnd reduction. */
if (flag & FLAG_DSACKING_ACK)
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
/* If this ack opens up a zero window, clear backoff. It was
* being used to time the probes, and is probably far higher than
* it needs to be for normal retransmission.
@@ -3662,8 +3711,8 @@ old_ack:
if (TCP_SKB_CB(skb)->sacked) {
flag |= tcp_sacktag_write_queue(sk, skb, prior_snd_una,
&sack_state);
- tcp_fastretrans_alert(sk, acked, prior_unsacked,
- is_dupack, flag);
+ tcp_fastretrans_alert(sk, acked, is_dupack, &flag, &rexmit);
+ tcp_xmit_recovery(sk, rexmit);
}
SOCK_DEBUG(sk, "Ack %u before %u:%u\n", ack, tp->snd_una, tp->snd_nxt);
@@ -3994,7 +4043,7 @@ void tcp_reset(struct sock *sk)
*
* If we are in FINWAIT-2, a received FIN moves us to TIME-WAIT.
*/
-static void tcp_fin(struct sock *sk)
+void tcp_fin(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
@@ -4481,19 +4530,34 @@ static int __must_check tcp_queue_rcv(struct sock *sk, struct sk_buff *skb, int
int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
{
struct sk_buff *skb;
+ int err = -ENOMEM;
+ int data_len = 0;
bool fragstolen;
if (size == 0)
return 0;
- skb = alloc_skb(size, sk->sk_allocation);
+ if (size > PAGE_SIZE) {
+ int npages = min_t(size_t, size >> PAGE_SHIFT, MAX_SKB_FRAGS);
+
+ data_len = npages << PAGE_SHIFT;
+ size = data_len + (size & ~PAGE_MASK);
+ }
+ skb = alloc_skb_with_frags(size - data_len, data_len,
+ PAGE_ALLOC_COSTLY_ORDER,
+ &err, sk->sk_allocation);
if (!skb)
goto err;
+ skb_put(skb, size - data_len);
+ skb->data_len = data_len;
+ skb->len = size;
+
if (tcp_try_rmem_schedule(sk, skb, skb->truesize))
goto err_free;
- if (memcpy_from_msg(skb_put(skb, size), msg, size))
+ err = skb_copy_datagram_from_iter(skb, 0, &msg->msg_iter, size);
+ if (err)
goto err_free;
TCP_SKB_CB(skb)->seq = tcp_sk(sk)->rcv_nxt;
@@ -4509,7 +4573,8 @@ int tcp_send_rcvq(struct sock *sk, struct msghdr *msg, size_t size)
err_free:
kfree_skb(skb);
err:
- return -ENOMEM;
+ return err;
+
}
static void tcp_data_queue(struct sock *sk, struct sk_buff *skb)
@@ -5492,6 +5557,9 @@ static bool tcp_rcv_fastopen_synack(struct sock *sk, struct sk_buff *synack,
tp->syn_data_acked = tp->syn_data;
if (tp->syn_data_acked)
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPFASTOPENACTIVE);
+
+ tcp_fastopen_add_skb(sk, synack);
+
return false;
}
@@ -5667,6 +5735,7 @@ discard:
}
tp->rcv_nxt = TCP_SKB_CB(skb)->seq + 1;
+ tp->copied_seq = tp->rcv_nxt;
tp->rcv_wup = TCP_SKB_CB(skb)->seq + 1;
/* RFC1323: The window in SYN & SYN/ACK segments is
@@ -6097,9 +6166,10 @@ static bool tcp_syn_flood_action(const struct sock *sk,
struct request_sock_queue *queue = &inet_csk(sk)->icsk_accept_queue;
const char *msg = "Dropping request";
bool want_cookie = false;
+ struct net *net = sock_net(sk);
#ifdef CONFIG_SYN_COOKIES
- if (sysctl_tcp_syncookies) {
+ if (net->ipv4.sysctl_tcp_syncookies) {
msg = "Sending cookies";
want_cookie = true;
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDOCOOKIES);
@@ -6108,7 +6178,7 @@ static bool tcp_syn_flood_action(const struct sock *sk,
NET_INC_STATS_BH(sock_net(sk), LINUX_MIB_TCPREQQFULLDROP);
if (!queue->synflood_warned &&
- sysctl_tcp_syncookies != 2 &&
+ net->ipv4.sysctl_tcp_syncookies != 2 &&
xchg(&queue->synflood_warned, 1) == 0)
pr_info("%s: Possible SYN flooding on port %d. %s. Check SNMP counters.\n",
proto, ntohs(tcp_hdr(skb)->dest), msg);
@@ -6141,6 +6211,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
__u32 isn = TCP_SKB_CB(skb)->tcp_tw_isn;
struct tcp_options_received tmp_opt;
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct sock *fastopen_sk = NULL;
struct dst_entry *dst = NULL;
struct request_sock *req;
@@ -6151,7 +6222,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
* limitations, they conserve resources and peer is
* evidently real one.
*/
- if ((sysctl_tcp_syncookies == 2 ||
+ if ((net->ipv4.sysctl_tcp_syncookies == 2 ||
inet_csk_reqsk_queue_is_full(sk)) && !isn) {
want_cookie = tcp_syn_flood_action(sk, skb, rsk_ops->slab_name);
if (!want_cookie)
@@ -6187,7 +6258,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_openreq_init(req, &tmp_opt, skb, sk);
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
- inet_rsk(req)->ir_iif = sk->sk_bound_dev_if;
+ inet_rsk(req)->ir_iif = inet_request_bound_dev_if(sk, skb);
af_ops->init_req(req, sk, skb);
@@ -6217,7 +6288,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
}
}
/* Kill the following clause, if you dislike this way. */
- else if (!sysctl_tcp_syncookies &&
+ else if (!net->ipv4.sysctl_tcp_syncookies &&
(sysctl_max_syn_backlog - inet_csk_reqsk_queue_len(sk) <
(sysctl_max_syn_backlog >> 2)) &&
!tcp_peer_is_proven(req, dst, false,
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 1c2648bbac4b..ad450509029b 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -73,7 +73,6 @@
#include <net/timewait_sock.h>
#include <net/xfrm.h>
#include <net/secure_seq.h>
-#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>
#include <linux/inet.h>
@@ -82,7 +81,7 @@
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/crypto.h>
+#include <crypto/hash.h>
#include <linux/scatterlist.h>
int sysctl_tcp_tw_reuse __read_mostly;
@@ -312,7 +311,7 @@ static void do_redirect(struct sk_buff *skb, struct sock *sk)
/* handle ICMP messages on TCP_NEW_SYN_RECV request sockets */
-void tcp_req_err(struct sock *sk, u32 seq)
+void tcp_req_err(struct sock *sk, u32 seq, bool abort)
{
struct request_sock *req = inet_reqsk(sk);
struct net *net = sock_net(sk);
@@ -320,11 +319,9 @@ void tcp_req_err(struct sock *sk, u32 seq)
/* ICMPs are not backlogged, hence we cannot get
* an established socket here.
*/
- WARN_ON(req->sk);
-
if (seq != tcp_rsk(req)->snt_isn) {
NET_INC_STATS_BH(net, LINUX_MIB_OUTOFWINDOWICMPS);
- } else {
+ } else if (abort) {
/*
* Still in SYN_RECV, just remove it silently.
* There is no good way to pass the error to the newly
@@ -384,7 +381,12 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
}
seq = ntohl(th->seq);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq);
+ return tcp_req_err(sk, seq,
+ type == ICMP_PARAMETERPROB ||
+ type == ICMP_TIME_EXCEEDED ||
+ (type == ICMP_DEST_UNREACH &&
+ (code == ICMP_NET_UNREACH ||
+ code == ICMP_HOST_UNREACH)));
bh_lock_sock(sk);
/* If too many ICMPs get dropped on busy
@@ -587,7 +589,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
} rep;
struct ip_reply_arg arg;
#ifdef CONFIG_TCP_MD5SIG
- struct tcp_md5sig_key *key;
+ struct tcp_md5sig_key *key = NULL;
const __u8 *hash_location = NULL;
unsigned char newhash[16];
int genhash;
@@ -627,7 +629,10 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
net = sk ? sock_net(sk) : dev_net(skb_dst(skb)->dev);
#ifdef CONFIG_TCP_MD5SIG
hash_location = tcp_parse_md5sig_option(th);
- if (!sk && hash_location) {
+ if (sk && sk_fullsock(sk)) {
+ key = tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
+ &ip_hdr(skb)->saddr, AF_INET);
+ } else if (hash_location) {
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -635,8 +640,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
* Incoming packet is checked with md5 hash with finding key,
* no RST generated if md5 hash doesn't match.
*/
- sk1 = __inet_lookup_listener(net,
- &tcp_hashinfo, ip_hdr(skb)->saddr,
+ sk1 = __inet_lookup_listener(net, &tcp_hashinfo, NULL, 0,
+ ip_hdr(skb)->saddr,
th->source, ip_hdr(skb)->daddr,
ntohs(th->source), inet_iif(skb));
/* don't send rst if it can't find key */
@@ -651,10 +656,6 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
genhash = tcp_v4_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
goto release_sk1;
- } else {
- key = sk ? tcp_md5_do_lookup(sk, (union tcp_md5_addr *)
- &ip_hdr(skb)->saddr,
- AF_INET) : NULL;
}
if (key) {
@@ -675,7 +676,8 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
ip_hdr(skb)->saddr, /* XXX */
arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
- arg.flags = (sk && inet_sk(sk)->transparent) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+ arg.flags = (sk && inet_sk_transparent(sk)) ? IP_REPLY_ARG_NOSRCCHECK : 0;
+
/* When socket is gone, all binding information is lost.
* routing might fail in this case. No choice here, if we choose to force
* input interface, we will misroute in case of asymmetric route.
@@ -683,6 +685,9 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
if (sk)
arg.bound_dev_if = sk->sk_bound_dev_if;
+ BUILD_BUG_ON(offsetof(struct sock, sk_bound_dev_if) !=
+ offsetof(struct inet_timewait_sock, tw_bound_dev_if));
+
arg.tos = ip_hdr(skb)->tos;
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
@@ -705,7 +710,8 @@ release_sk1:
outside socket context is ugly, certainly. What can I do?
*/
-static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
+static void tcp_v4_send_ack(struct net *net,
+ struct sk_buff *skb, u32 seq, u32 ack,
u32 win, u32 tsval, u32 tsecr, int oif,
struct tcp_md5sig_key *key,
int reply_flags, u8 tos)
@@ -720,7 +726,6 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack,
];
} rep;
struct ip_reply_arg arg;
- struct net *net = dev_net(skb_dst(skb)->dev);
memset(&rep.th, 0, sizeof(struct tcphdr));
memset(&arg, 0, sizeof(arg));
@@ -782,7 +787,8 @@ static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
struct inet_timewait_sock *tw = inet_twsk(sk);
struct tcp_timewait_sock *tcptw = tcp_twsk(sk);
- tcp_v4_send_ack(skb, tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
+ tcp_v4_send_ack(sock_net(sk), skb,
+ tcptw->tw_snd_nxt, tcptw->tw_rcv_nxt,
tcptw->tw_rcv_wnd >> tw->tw_rcv_wscale,
tcp_time_stamp + tcptw->tw_ts_offset,
tcptw->tw_ts_recent,
@@ -801,8 +807,10 @@ static void tcp_v4_reqsk_send_ack(const struct sock *sk, struct sk_buff *skb,
/* sk->sk_state == TCP_LISTEN -> for regular TCP_SYN_RECV
* sk->sk_state == TCP_SYN_RECV -> for Fast Open.
*/
- tcp_v4_send_ack(skb, (sk->sk_state == TCP_LISTEN) ?
- tcp_rsk(req)->snt_isn + 1 : tcp_sk(sk)->snd_nxt,
+ u32 seq = (sk->sk_state == TCP_LISTEN) ? tcp_rsk(req)->snt_isn + 1 :
+ tcp_sk(sk)->snd_nxt;
+
+ tcp_v4_send_ack(sock_net(sk), skb, seq,
tcp_rsk(req)->rcv_nxt, req->rsk_rcv_wnd,
tcp_time_stamp,
req->ts_recent,
@@ -855,7 +863,6 @@ static void tcp_v4_reqsk_destructor(struct request_sock *req)
kfree(inet_rsk(req)->opt);
}
-
#ifdef CONFIG_TCP_MD5SIG
/*
* RFC2385 MD5 checksumming requires a mapping of
@@ -921,7 +928,8 @@ int tcp_md5_do_add(struct sock *sk, const union tcp_md5_addr *addr,
}
md5sig = rcu_dereference_protected(tp->md5sig_info,
- sock_owned_by_user(sk));
+ sock_owned_by_user(sk) ||
+ lockdep_is_held(&sk->sk_lock.slock));
if (!md5sig) {
md5sig = kmalloc(sizeof(*md5sig), gfp);
if (!md5sig)
@@ -1028,21 +1036,22 @@ static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
bp->len = cpu_to_be16(nbytes);
sg_init_one(&sg, bp, sizeof(*bp));
- return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->md5_req);
}
static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
__be32 daddr, __be32 saddr, const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
- struct hash_desc *desc;
+ struct ahash_request *req;
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
- desc = &hp->md5_desc;
+ req = hp->md5_req;
- if (crypto_hash_init(desc))
+ if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
goto clear_hash;
@@ -1050,7 +1059,8 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const struct tcp_md5sig_key *key,
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
- if (crypto_hash_final(desc, md5_hash))
+ ahash_request_set_crypt(req, NULL, md5_hash, 0);
+ if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
@@ -1068,7 +1078,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
const struct sk_buff *skb)
{
struct tcp_md5sig_pool *hp;
- struct hash_desc *desc;
+ struct ahash_request *req;
const struct tcphdr *th = tcp_hdr(skb);
__be32 saddr, daddr;
@@ -1084,9 +1094,9 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
- desc = &hp->md5_desc;
+ req = hp->md5_req;
- if (crypto_hash_init(desc))
+ if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
@@ -1097,7 +1107,8 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct tcp_md5sig_key *key,
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
- if (crypto_hash_final(desc, md5_hash))
+ ahash_request_set_crypt(req, NULL, md5_hash, 0);
+ if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
@@ -1275,6 +1286,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
ireq = inet_rsk(req);
sk_daddr_set(newsk, ireq->ir_rmt_addr);
sk_rcv_saddr_set(newsk, ireq->ir_loc_addr);
+ newsk->sk_bound_dev_if = ireq->ir_iif;
newinet->inet_saddr = ireq->ir_loc_addr;
inet_opt = ireq->opt;
rcu_assign_pointer(newinet->inet_opt, inet_opt);
@@ -1326,6 +1338,8 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
if (__inet_inherit_port(sk, newsk) < 0)
goto put_and_exit;
*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
+ if (*own_req)
+ tcp_move_syn(newtp, req);
return newsk;
@@ -1490,7 +1504,7 @@ bool tcp_prequeue(struct sock *sk, struct sk_buff *skb)
if (likely(sk->sk_rx_dst))
skb_dst_drop(skb);
else
- skb_dst_force(skb);
+ skb_dst_force_safe(skb);
__skb_queue_tail(&tp->ucopy.prequeue, skb);
tp->ucopy.memory += skb->truesize;
@@ -1573,7 +1587,8 @@ int tcp_v4_rcv(struct sk_buff *skb)
TCP_SKB_CB(skb)->sacked = 0;
lookup:
- sk = __inet_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest);
+ sk = __inet_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th), th->source,
+ th->dest);
if (!sk)
goto no_tcp_socket;
@@ -1583,28 +1598,30 @@ process:
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
- struct sock *nsk = NULL;
+ struct sock *nsk;
sk = req->rsk_listener;
- if (tcp_v4_inbound_md5_hash(sk, skb))
- goto discard_and_relse;
- if (likely(sk->sk_state == TCP_LISTEN)) {
- nsk = tcp_check_req(sk, skb, req, false);
- } else {
+ if (unlikely(tcp_v4_inbound_md5_hash(sk, skb))) {
+ reqsk_put(req);
+ goto discard_it;
+ }
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
+ sock_hold(sk);
+ nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
- goto discard_it;
+ goto discard_and_relse;
}
if (nsk == sk) {
- sock_hold(sk);
reqsk_put(req);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v4_send_reset(nsk, skb);
- goto discard_it;
+ goto discard_and_relse;
} else {
+ sock_put(sk);
return 0;
}
}
@@ -1634,7 +1651,7 @@ process:
sk_incoming_cpu_update(sk);
bh_lock_sock_nested(sk);
- tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1687,7 +1704,8 @@ do_time_wait:
switch (tcp_timewait_state_process(inet_twsk(sk), skb, th)) {
case TCP_TW_SYN: {
struct sock *sk2 = inet_lookup_listener(dev_net(skb->dev),
- &tcp_hashinfo,
+ &tcp_hashinfo, skb,
+ __tcp_hdrlen(th),
iph->saddr, th->source,
iph->daddr, th->dest,
inet_iif(skb));
@@ -1702,7 +1720,9 @@ do_time_wait:
tcp_v4_timewait_ack(sk, skb);
break;
case TCP_TW_RST:
- goto no_tcp_socket;
+ tcp_v4_send_reset(sk, skb);
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ goto discard_it;
case TCP_TW_SUCCESS:;
}
goto discard_it;
@@ -1718,8 +1738,7 @@ void inet_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- if (dst) {
- dst_hold(dst);
+ if (dst && dst_hold_safe(dst)) {
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
}
@@ -1810,7 +1829,9 @@ void tcp_v4_destroy_sock(struct sock *sk)
tcp_saved_syn_free(tp);
sk_sockets_allocated_dec(sk);
- sock_release_memcg(sk);
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ sock_release_memcg(sk);
}
EXPORT_SYMBOL(tcp_v4_destroy_sock);
@@ -2156,6 +2177,7 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
__u16 destp = ntohs(inet->inet_dport);
__u16 srcp = ntohs(inet->inet_sport);
int rx_queue;
+ int state;
if (icsk->icsk_pending == ICSK_TIME_RETRANS ||
icsk->icsk_pending == ICSK_TIME_EARLY_RETRANS ||
@@ -2173,17 +2195,18 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
timer_expires = jiffies;
}
- if (sk->sk_state == TCP_LISTEN)
+ state = sk_state_load(sk);
+ if (state == TCP_LISTEN)
rx_queue = sk->sk_ack_backlog;
else
- /*
- * because we dont lock socket, we might find a transient negative value
+ /* Because we don't lock the socket,
+ * we might find a transient negative value.
*/
rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
seq_printf(f, "%4d: %08X:%04X %08X:%04X %02X %08X:%08X %02X:%08lX "
"%08X %5u %8d %lu %d %pK %lu %lu %u %u %d",
- i, src, srcp, dest, destp, sk->sk_state,
+ i, src, srcp, dest, destp, state,
tp->write_seq - tp->snd_una,
rx_queue,
timer_active,
@@ -2197,8 +2220,8 @@ static void get_tcp4_sock(struct sock *sk, struct seq_file *f, int i)
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
- sk->sk_state == TCP_LISTEN ?
- (fastopenq ? fastopenq->max_qlen : 0) :
+ state == TCP_LISTEN ?
+ fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh));
}
@@ -2332,11 +2355,7 @@ struct proto tcp_prot = {
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
-#ifdef CONFIG_MEMCG_KMEM
- .init_cgroup = tcp_init_cgroup,
- .destroy_cgroup = tcp_destroy_cgroup,
- .proto_cgroup = tcp_proto_cgroup,
-#endif
+ .diag_destroy = tcp_abort,
};
EXPORT_SYMBOL(tcp_prot);
@@ -2374,6 +2393,20 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_probe_threshold = TCP_PROBE_THRESHOLD;
net->ipv4.sysctl_tcp_probe_interval = TCP_PROBE_INTERVAL;
+ net->ipv4.sysctl_tcp_keepalive_time = TCP_KEEPALIVE_TIME;
+ net->ipv4.sysctl_tcp_keepalive_probes = TCP_KEEPALIVE_PROBES;
+ net->ipv4.sysctl_tcp_keepalive_intvl = TCP_KEEPALIVE_INTVL;
+
+ net->ipv4.sysctl_tcp_syn_retries = TCP_SYN_RETRIES;
+ net->ipv4.sysctl_tcp_synack_retries = TCP_SYNACK_RETRIES;
+ net->ipv4.sysctl_tcp_syncookies = 1;
+ net->ipv4.sysctl_tcp_reordering = TCP_FASTRETRANS_THRESH;
+ net->ipv4.sysctl_tcp_retries1 = TCP_RETR1;
+ net->ipv4.sysctl_tcp_retries2 = TCP_RETR2;
+ net->ipv4.sysctl_tcp_orphan_retries = 0;
+ net->ipv4.sysctl_tcp_fin_timeout = TCP_FIN_TIMEOUT;
+ net->ipv4.sysctl_tcp_notsent_lowat = UINT_MAX;
+
return 0;
fail:
tcp_sk_exit(net);
diff --git a/net/ipv4/tcp_memcontrol.c b/net/ipv4/tcp_memcontrol.c
deleted file mode 100644
index 2379c1b4efb2..000000000000
--- a/net/ipv4/tcp_memcontrol.c
+++ /dev/null
@@ -1,233 +0,0 @@
-#include <net/tcp.h>
-#include <net/tcp_memcontrol.h>
-#include <net/sock.h>
-#include <net/ip.h>
-#include <linux/nsproxy.h>
-#include <linux/memcontrol.h>
-#include <linux/module.h>
-
-int tcp_init_cgroup(struct mem_cgroup *memcg, struct cgroup_subsys *ss)
-{
- /*
- * The root cgroup does not use page_counters, but rather,
- * rely on the data already collected by the network
- * subsystem
- */
- struct mem_cgroup *parent = parent_mem_cgroup(memcg);
- struct page_counter *counter_parent = NULL;
- struct cg_proto *cg_proto, *parent_cg;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return 0;
-
- cg_proto->sysctl_mem[0] = sysctl_tcp_mem[0];
- cg_proto->sysctl_mem[1] = sysctl_tcp_mem[1];
- cg_proto->sysctl_mem[2] = sysctl_tcp_mem[2];
- cg_proto->memory_pressure = 0;
- cg_proto->memcg = memcg;
-
- parent_cg = tcp_prot.proto_cgroup(parent);
- if (parent_cg)
- counter_parent = &parent_cg->memory_allocated;
-
- page_counter_init(&cg_proto->memory_allocated, counter_parent);
- percpu_counter_init(&cg_proto->sockets_allocated, 0, GFP_KERNEL);
-
- return 0;
-}
-EXPORT_SYMBOL(tcp_init_cgroup);
-
-void tcp_destroy_cgroup(struct mem_cgroup *memcg)
-{
- struct cg_proto *cg_proto;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return;
-
- percpu_counter_destroy(&cg_proto->sockets_allocated);
-
- if (test_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
- static_key_slow_dec(&memcg_socket_limit_enabled);
-
-}
-EXPORT_SYMBOL(tcp_destroy_cgroup);
-
-static int tcp_update_limit(struct mem_cgroup *memcg, unsigned long nr_pages)
-{
- struct cg_proto *cg_proto;
- int i;
- int ret;
-
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return -EINVAL;
-
- ret = page_counter_limit(&cg_proto->memory_allocated, nr_pages);
- if (ret)
- return ret;
-
- for (i = 0; i < 3; i++)
- cg_proto->sysctl_mem[i] = min_t(long, nr_pages,
- sysctl_tcp_mem[i]);
-
- if (nr_pages == PAGE_COUNTER_MAX)
- clear_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
- else {
- /*
- * The active bit needs to be written after the static_key
- * update. This is what guarantees that the socket activation
- * function is the last one to run. See sock_update_memcg() for
- * details, and note that we don't mark any socket as belonging
- * to this memcg until that flag is up.
- *
- * We need to do this, because static_keys will span multiple
- * sites, but we can't control their order. If we mark a socket
- * as accounted, but the accounting functions are not patched in
- * yet, we'll lose accounting.
- *
- * We never race with the readers in sock_update_memcg(),
- * because when this value change, the code to process it is not
- * patched in yet.
- *
- * The activated bit is used to guarantee that no two writers
- * will do the update in the same memcg. Without that, we can't
- * properly shutdown the static key.
- */
- if (!test_and_set_bit(MEMCG_SOCK_ACTIVATED, &cg_proto->flags))
- static_key_slow_inc(&memcg_socket_limit_enabled);
- set_bit(MEMCG_SOCK_ACTIVE, &cg_proto->flags);
- }
-
- return 0;
-}
-
-enum {
- RES_USAGE,
- RES_LIMIT,
- RES_MAX_USAGE,
- RES_FAILCNT,
-};
-
-static DEFINE_MUTEX(tcp_limit_mutex);
-
-static ssize_t tcp_cgroup_write(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
- unsigned long nr_pages;
- int ret = 0;
-
- buf = strstrip(buf);
-
- switch (of_cft(of)->private) {
- case RES_LIMIT:
- /* see memcontrol.c */
- ret = page_counter_memparse(buf, "-1", &nr_pages);
- if (ret)
- break;
- mutex_lock(&tcp_limit_mutex);
- ret = tcp_update_limit(memcg, nr_pages);
- mutex_unlock(&tcp_limit_mutex);
- break;
- default:
- ret = -EINVAL;
- break;
- }
- return ret ?: nbytes;
-}
-
-static u64 tcp_cgroup_read(struct cgroup_subsys_state *css, struct cftype *cft)
-{
- struct mem_cgroup *memcg = mem_cgroup_from_css(css);
- struct cg_proto *cg_proto = tcp_prot.proto_cgroup(memcg);
- u64 val;
-
- switch (cft->private) {
- case RES_LIMIT:
- if (!cg_proto)
- return PAGE_COUNTER_MAX;
- val = cg_proto->memory_allocated.limit;
- val *= PAGE_SIZE;
- break;
- case RES_USAGE:
- if (!cg_proto)
- val = atomic_long_read(&tcp_memory_allocated);
- else
- val = page_counter_read(&cg_proto->memory_allocated);
- val *= PAGE_SIZE;
- break;
- case RES_FAILCNT:
- if (!cg_proto)
- return 0;
- val = cg_proto->memory_allocated.failcnt;
- break;
- case RES_MAX_USAGE:
- if (!cg_proto)
- return 0;
- val = cg_proto->memory_allocated.watermark;
- val *= PAGE_SIZE;
- break;
- default:
- BUG();
- }
- return val;
-}
-
-static ssize_t tcp_cgroup_reset(struct kernfs_open_file *of,
- char *buf, size_t nbytes, loff_t off)
-{
- struct mem_cgroup *memcg;
- struct cg_proto *cg_proto;
-
- memcg = mem_cgroup_from_css(of_css(of));
- cg_proto = tcp_prot.proto_cgroup(memcg);
- if (!cg_proto)
- return nbytes;
-
- switch (of_cft(of)->private) {
- case RES_MAX_USAGE:
- page_counter_reset_watermark(&cg_proto->memory_allocated);
- break;
- case RES_FAILCNT:
- cg_proto->memory_allocated.failcnt = 0;
- break;
- }
-
- return nbytes;
-}
-
-static struct cftype tcp_files[] = {
- {
- .name = "kmem.tcp.limit_in_bytes",
- .write = tcp_cgroup_write,
- .read_u64 = tcp_cgroup_read,
- .private = RES_LIMIT,
- },
- {
- .name = "kmem.tcp.usage_in_bytes",
- .read_u64 = tcp_cgroup_read,
- .private = RES_USAGE,
- },
- {
- .name = "kmem.tcp.failcnt",
- .private = RES_FAILCNT,
- .write = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
- {
- .name = "kmem.tcp.max_usage_in_bytes",
- .private = RES_MAX_USAGE,
- .write = tcp_cgroup_reset,
- .read_u64 = tcp_cgroup_read,
- },
- { } /* terminate */
-};
-
-static int __init tcp_memcontrol_init(void)
-{
- WARN_ON(cgroup_add_legacy_cftypes(&memory_cgrp_subsys, tcp_files));
- return 0;
-}
-__initcall(tcp_memcontrol_init);
diff --git a/net/ipv4/tcp_metrics.c b/net/ipv4/tcp_metrics.c
index c8cbc2b4b792..7b7eec439906 100644
--- a/net/ipv4/tcp_metrics.c
+++ b/net/ipv4/tcp_metrics.c
@@ -369,6 +369,7 @@ void tcp_update_metrics(struct sock *sk)
const struct inet_connection_sock *icsk = inet_csk(sk);
struct dst_entry *dst = __sk_dst_get(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct tcp_metrics_block *tm;
unsigned long rtt;
u32 val;
@@ -473,7 +474,7 @@ void tcp_update_metrics(struct sock *sk)
if (!tcp_metric_locked(tm, TCP_METRIC_REORDERING)) {
val = tcp_metric_get(tm, TCP_METRIC_REORDERING);
if (val < tp->reordering &&
- tp->reordering != sysctl_tcp_reordering)
+ tp->reordering != net->ipv4.sysctl_tcp_reordering)
tcp_metric_set(tm, TCP_METRIC_REORDERING,
tp->reordering);
}
@@ -550,7 +551,7 @@ reset:
*/
if (crtt > tp->srtt_us) {
/* Set RTO like tcp_rtt_estimator(), but from cached RTT. */
- crtt /= 8 * USEC_PER_MSEC;
+ crtt /= 8 * USEC_PER_SEC / HZ;
inet_csk(sk)->icsk_rto = crtt + max(2 * crtt, tcp_rto_min(sk));
} else if (tp->srtt_us == 0) {
/* RFC6298: 5.7 We've failed to get a valid RTT sample from
diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
index 3575dd1e5b67..acb366dd61e6 100644
--- a/net/ipv4/tcp_minisocks.c
+++ b/net/ipv4/tcp_minisocks.c
@@ -27,9 +27,6 @@
#include <net/inet_common.h>
#include <net/xfrm.h>
-int sysctl_tcp_syncookies __read_mostly = 1;
-EXPORT_SYMBOL(sysctl_tcp_syncookies);
-
int sysctl_tcp_abort_on_overflow __read_mostly;
struct inet_timewait_death_row tcp_death_row = {
@@ -131,7 +128,7 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
goto kill;
if (th->syn && !before(TCP_SKB_CB(skb)->seq, tcptw->tw_rcv_nxt))
- goto kill_with_rst;
+ return TCP_TW_RST;
/* Dup ACK? */
if (!th->ack ||
@@ -145,11 +142,8 @@ tcp_timewait_state_process(struct inet_timewait_sock *tw, struct sk_buff *skb,
* reset.
*/
if (!th->fin ||
- TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1) {
-kill_with_rst:
- inet_twsk_deschedule_put(tw);
+ TCP_SKB_CB(skb)->end_seq != tcptw->tw_rcv_nxt + 1)
return TCP_TW_RST;
- }
/* FIN arrived, enter true time-wait state. */
tw->tw_substate = TCP_TIME_WAIT;
@@ -458,7 +452,7 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rcv_wup = newtp->copied_seq =
newtp->rcv_nxt = treq->rcv_isn + 1;
- newtp->segs_in = 0;
+ newtp->segs_in = 1;
newtp->snd_sml = newtp->snd_una =
newtp->snd_nxt = newtp->snd_up = treq->snt_isn + 1;
@@ -551,9 +545,6 @@ struct sock *tcp_create_openreq_child(const struct sock *sk,
newtp->rack.mstamp.v64 = 0;
newtp->rack.advanced = 0;
- newtp->saved_syn = req->saved_syn;
- req->saved_syn = NULL;
-
TCP_INC_STATS_BH(sock_net(sk), TCP_MIB_PASSIVEOPENS);
}
return newsk;
@@ -821,6 +812,7 @@ int tcp_child_process(struct sock *parent, struct sock *child,
int ret = 0;
int state = child->sk_state;
+ tcp_segs_in(tcp_sk(child), skb);
if (!sock_owned_by_user(child)) {
ret = tcp_rcv_state_process(child, skb);
/* Wakeup parent, send SIGIO */
diff --git a/net/ipv4/tcp_offload.c b/net/ipv4/tcp_offload.c
index 9864a2dbadce..773083b7f1e9 100644
--- a/net/ipv4/tcp_offload.c
+++ b/net/ipv4/tcp_offload.c
@@ -135,7 +135,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
th->fin = th->psh = 0;
th->check = newcheck;
- if (skb->ip_summed != CHECKSUM_PARTIAL)
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(skb, ~th->check);
+ else
th->check = gso_make_checksum(skb, ~th->check);
seq += mss;
@@ -169,7 +171,9 @@ struct sk_buff *tcp_gso_segment(struct sk_buff *skb,
skb->data_len);
th->check = ~csum_fold((__force __wsum)((__force u32)th->check +
(__force u32)delta));
- if (skb->ip_summed != CHECKSUM_PARTIAL)
+ if (skb->ip_summed == CHECKSUM_PARTIAL)
+ gso_reset_checksum(skb, ~th->check);
+ else
th->check = gso_make_checksum(skb, ~th->check);
out:
return segs;
diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c
index cb7ca569052c..7d2dc015cd19 100644
--- a/net/ipv4/tcp_output.c
+++ b/net/ipv4/tcp_output.c
@@ -62,9 +62,6 @@ int sysctl_tcp_tso_win_divisor __read_mostly = 3;
/* By default, RFC2861 behavior. */
int sysctl_tcp_slow_start_after_idle __read_mostly = 1;
-unsigned int sysctl_tcp_notsent_lowat __read_mostly = UINT_MAX;
-EXPORT_SYMBOL(sysctl_tcp_notsent_lowat);
-
static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
int push_one, gfp_t gfp);
@@ -1006,8 +1003,10 @@ static int tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, int clone_it,
if (likely(tcb->tcp_flags & TCPHDR_ACK))
tcp_event_ack_sent(sk, tcp_skb_pcount(skb));
- if (skb->len != tcp_header_size)
+ if (skb->len != tcp_header_size) {
tcp_event_data_sent(tp, sk);
+ tp->data_segs_out += tcp_skb_pcount(skb);
+ }
if (after(tcb->end_seq, tp->snd_nxt) || tcb->seq == tcb->end_seq)
TCP_ADD_STATS(sock_net(sk), TCP_MIB_OUTSEGS,
@@ -2296,7 +2295,7 @@ void __tcp_push_pending_frames(struct sock *sk, unsigned int cur_mss,
return;
if (tcp_write_xmit(sk, cur_mss, nonagle, 0,
- sk_gfp_atomic(sk, GFP_ATOMIC)))
+ sk_gfp_mask(sk, GFP_ATOMIC)))
tcp_check_probe_timer(sk);
}
@@ -2813,13 +2812,16 @@ begin_fwd:
*/
void sk_forced_mem_schedule(struct sock *sk, int size)
{
- int amt, status;
+ int amt;
if (size <= sk->sk_forward_alloc)
return;
amt = sk_mem_pages(size);
sk->sk_forward_alloc += amt * SK_MEM_QUANTUM;
- sk_memory_allocated_add(sk, amt, &status);
+ sk_memory_allocated_add(sk, amt);
+
+ if (mem_cgroup_sockets_enabled && sk->sk_memcg)
+ mem_cgroup_charge_skmem(sk->sk_memcg, amt);
}
/* Send a FIN. The caller locks the socket for us.
@@ -3150,7 +3152,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
{
struct tcp_sock *tp = tcp_sk(sk);
struct tcp_fastopen_request *fo = tp->fastopen_req;
- int syn_loss = 0, space, err = 0, copied;
+ int syn_loss = 0, space, err = 0;
unsigned long last_syn_loss = 0;
struct sk_buff *syn_data;
@@ -3188,17 +3190,18 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
goto fallback;
syn_data->ip_summed = CHECKSUM_PARTIAL;
memcpy(syn_data->cb, syn->cb, sizeof(syn->cb));
- copied = copy_from_iter(skb_put(syn_data, space), space,
- &fo->data->msg_iter);
- if (unlikely(!copied)) {
- kfree_skb(syn_data);
- goto fallback;
- }
- if (copied != space) {
- skb_trim(syn_data, copied);
- space = copied;
+ if (space) {
+ int copied = copy_from_iter(skb_put(syn_data, space), space,
+ &fo->data->msg_iter);
+ if (unlikely(!copied)) {
+ kfree_skb(syn_data);
+ goto fallback;
+ }
+ if (copied != space) {
+ skb_trim(syn_data, copied);
+ space = copied;
+ }
}
-
/* No more data pending in inet_wait_for_connect() */
if (space == fo->size)
fo->data = NULL;
@@ -3352,8 +3355,9 @@ void tcp_send_ack(struct sock *sk)
* tcp_transmit_skb() will set the ownership to this
* sock.
*/
- buff = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
- if (!buff) {
+ buff = alloc_skb(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
+ if (unlikely(!buff)) {
inet_csk_schedule_ack(sk);
inet_csk(sk)->icsk_ack.ato = TCP_ATO_MIN;
inet_csk_reset_xmit_timer(sk, ICSK_TIME_DACK,
@@ -3375,7 +3379,7 @@ void tcp_send_ack(struct sock *sk)
/* Send it off, this clears delayed acks for us. */
skb_mstamp_get(&buff->skb_mstamp);
- tcp_transmit_skb(sk, buff, 0, sk_gfp_atomic(sk, GFP_ATOMIC));
+ tcp_transmit_skb(sk, buff, 0, (__force gfp_t)0);
}
EXPORT_SYMBOL_GPL(tcp_send_ack);
@@ -3396,7 +3400,8 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
struct sk_buff *skb;
/* We don't queue it, tcp_transmit_skb() sets ownership. */
- skb = alloc_skb(MAX_TCP_HEADER, sk_gfp_atomic(sk, GFP_ATOMIC));
+ skb = alloc_skb(MAX_TCP_HEADER,
+ sk_gfp_mask(sk, GFP_ATOMIC | __GFP_NOWARN));
if (!skb)
return -1;
@@ -3409,7 +3414,7 @@ static int tcp_xmit_probe_skb(struct sock *sk, int urgent, int mib)
tcp_init_nondata_skb(skb, tp->snd_una - !urgent, TCPHDR_ACK);
skb_mstamp_get(&skb->skb_mstamp);
NET_INC_STATS(sock_net(sk), mib);
- return tcp_transmit_skb(sk, skb, 0, GFP_ATOMIC);
+ return tcp_transmit_skb(sk, skb, 0, (__force gfp_t)0);
}
void tcp_send_window_probe(struct sock *sk)
@@ -3470,6 +3475,7 @@ void tcp_send_probe0(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
unsigned long probe_max;
int err;
@@ -3483,7 +3489,7 @@ void tcp_send_probe0(struct sock *sk)
}
if (err <= 0) {
- if (icsk->icsk_backoff < sysctl_tcp_retries2)
+ if (icsk->icsk_backoff < net->ipv4.sysctl_tcp_retries2)
icsk->icsk_backoff++;
icsk->icsk_probes_out++;
probe_max = TCP_RTO_MAX;
diff --git a/net/ipv4/tcp_probe.c b/net/ipv4/tcp_probe.c
index ebf5ff57526e..f6c50af24a64 100644
--- a/net/ipv4/tcp_probe.c
+++ b/net/ipv4/tcp_probe.c
@@ -187,13 +187,13 @@ static int tcpprobe_sprint(char *tbuf, int n)
{
const struct tcp_log *p
= tcp_probe.log + tcp_probe.tail;
- struct timespec tv
- = ktime_to_timespec(ktime_sub(p->tstamp, tcp_probe.start));
+ struct timespec64 ts
+ = ktime_to_timespec64(ktime_sub(p->tstamp, tcp_probe.start));
return scnprintf(tbuf, n,
"%lu.%09lu %pISpc %pISpc %d %#x %#x %u %u %u %u %u\n",
- (unsigned long)tv.tv_sec,
- (unsigned long)tv.tv_nsec,
+ (unsigned long)ts.tv_sec,
+ (unsigned long)ts.tv_nsec,
&p->src, &p->dst, p->length, p->snd_nxt, p->snd_una,
p->snd_cwnd, p->ssthresh, p->snd_wnd, p->srtt, p->rcv_wnd);
}
diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c
index c9c716a483e4..49bc474f8e35 100644
--- a/net/ipv4/tcp_timer.c
+++ b/net/ipv4/tcp_timer.c
@@ -22,14 +22,6 @@
#include <linux/gfp.h>
#include <net/tcp.h>
-int sysctl_tcp_syn_retries __read_mostly = TCP_SYN_RETRIES;
-int sysctl_tcp_synack_retries __read_mostly = TCP_SYNACK_RETRIES;
-int sysctl_tcp_keepalive_time __read_mostly = TCP_KEEPALIVE_TIME;
-int sysctl_tcp_keepalive_probes __read_mostly = TCP_KEEPALIVE_PROBES;
-int sysctl_tcp_keepalive_intvl __read_mostly = TCP_KEEPALIVE_INTVL;
-int sysctl_tcp_retries1 __read_mostly = TCP_RETR1;
-int sysctl_tcp_retries2 __read_mostly = TCP_RETR2;
-int sysctl_tcp_orphan_retries __read_mostly;
int sysctl_tcp_thin_linear_timeouts __read_mostly;
static void tcp_write_err(struct sock *sk)
@@ -85,7 +77,7 @@ static int tcp_out_of_resources(struct sock *sk, bool do_reset)
/* Calculate maximal number or retries on an orphaned socket. */
static int tcp_orphan_retries(struct sock *sk, bool alive)
{
- int retries = sysctl_tcp_orphan_retries; /* May be zero. */
+ int retries = sock_net(sk)->ipv4.sysctl_tcp_orphan_retries; /* May be zero. */
/* We know from an ICMP that something is wrong. */
if (sk->sk_err_soft && !alive)
@@ -160,6 +152,7 @@ static int tcp_write_timeout(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
int retry_until;
bool do_reset, syn_set = false;
@@ -168,21 +161,33 @@ static int tcp_write_timeout(struct sock *sk)
dst_negative_advice(sk);
if (tp->syn_fastopen || tp->syn_data)
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
- if (tp->syn_data)
+ if (tp->syn_data && icsk->icsk_retransmits == 1)
NET_INC_STATS_BH(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
- retry_until = icsk->icsk_syn_retries ? : sysctl_tcp_syn_retries;
+ retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
syn_set = true;
} else {
- if (retransmits_timed_out(sk, sysctl_tcp_retries1, 0, 0)) {
+ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1, 0, 0)) {
+ /* Some middle-boxes may black-hole Fast Open _after_
+ * the handshake. Therefore we conservatively disable
+ * Fast Open on this path on recurring timeouts with
+ * few or zero bytes acked after Fast Open.
+ */
+ if (tp->syn_data_acked &&
+ tp->bytes_acked <= tp->rx_opt.mss_clamp) {
+ tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
+ if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
+ NET_INC_STATS_BH(sock_net(sk),
+ LINUX_MIB_TCPFASTOPENACTIVEFAIL);
+ }
/* Black hole detection */
tcp_mtu_probing(icsk, sk);
dst_negative_advice(sk);
}
- retry_until = sysctl_tcp_retries2;
+ retry_until = net->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = icsk->icsk_rto < TCP_RTO_MAX;
@@ -296,7 +301,7 @@ static void tcp_probe_timer(struct sock *sk)
(s32)(tcp_time_stamp - start_ts) > icsk->icsk_user_timeout)
goto abort;
- max_probes = sysctl_tcp_retries2;
+ max_probes = sock_net(sk)->ipv4.sysctl_tcp_retries2;
if (sock_flag(sk, SOCK_DEAD)) {
const bool alive = inet_csk_rto_backoff(icsk, TCP_RTO_MAX) < TCP_RTO_MAX;
@@ -323,7 +328,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
int max_retries = icsk->icsk_syn_retries ? :
- sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
+ sock_net(sk)->ipv4.sysctl_tcp_synack_retries + 1; /* add one more retry for fastopen */
struct request_sock *req;
req = tcp_sk(sk)->fastopen_rsk;
@@ -351,6 +356,7 @@ static void tcp_fastopen_synack_timer(struct sock *sk)
void tcp_retransmit_timer(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
+ struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk);
if (tp->fastopen_rsk) {
@@ -481,7 +487,7 @@ out_reset_timer:
icsk->icsk_rto = min(icsk->icsk_rto << 1, TCP_RTO_MAX);
}
inet_csk_reset_xmit_timer(sk, ICSK_TIME_RETRANS, icsk->icsk_rto, TCP_RTO_MAX);
- if (retransmits_timed_out(sk, sysctl_tcp_retries1 + 1, 0, 0))
+ if (retransmits_timed_out(sk, net->ipv4.sysctl_tcp_retries1 + 1, 0, 0))
__sk_dst_reset(sk);
out:;
diff --git a/net/ipv4/tcp_yeah.c b/net/ipv4/tcp_yeah.c
index 17d35662930d..3e6a472e6b88 100644
--- a/net/ipv4/tcp_yeah.c
+++ b/net/ipv4/tcp_yeah.c
@@ -219,7 +219,7 @@ static u32 tcp_yeah_ssthresh(struct sock *sk)
yeah->fast_count = 0;
yeah->reno_count = max(yeah->reno_count>>1, 2U);
- return tp->snd_cwnd - reduction;
+ return max_t(int, tp->snd_cwnd - reduction, 2);
}
static struct tcp_congestion_ops tcp_yeah __read_mostly = {
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index 24ec14f9825c..08eed5e16df0 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -100,7 +100,6 @@
#include <linux/slab.h>
#include <net/tcp_states.h>
#include <linux/skbuff.h>
-#include <linux/netdevice.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
#include <net/net_namespace.h>
@@ -114,6 +113,7 @@
#include <trace/events/skb.h>
#include <net/busy_poll.h>
#include "udp_impl.h"
+#include <net/sock_reuseport.h>
struct udp_table udp_table __read_mostly;
EXPORT_SYMBOL(udp_table);
@@ -138,7 +138,8 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
unsigned long *bitmap,
struct sock *sk,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2),
+ const struct sock *sk2,
+ bool match_wildcard),
unsigned int log)
{
struct sock *sk2;
@@ -153,8 +154,9 @@ static int udp_lib_lport_inuse(struct net *net, __u16 num,
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(!sk2->sk_reuseport || !sk->sk_reuseport ||
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
!uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2)) {
+ saddr_comp(sk, sk2, true)) {
if (!bitmap)
return 1;
__set_bit(udp_sk(sk2)->udp_port_hash >> log, bitmap);
@@ -171,7 +173,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
struct udp_hslot *hslot2,
struct sock *sk,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2))
+ const struct sock *sk2,
+ bool match_wildcard))
{
struct sock *sk2;
struct hlist_nulls_node *node;
@@ -187,8 +190,9 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
(!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if ||
sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
(!sk2->sk_reuseport || !sk->sk_reuseport ||
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
!uid_eq(uid, sock_i_uid(sk2))) &&
- saddr_comp(sk, sk2)) {
+ saddr_comp(sk, sk2, true)) {
res = 1;
break;
}
@@ -197,6 +201,35 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
return res;
}
+static int udp_reuseport_add_sock(struct sock *sk, struct udp_hslot *hslot,
+ int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard))
+{
+ struct net *net = sock_net(sk);
+ struct hlist_nulls_node *node;
+ kuid_t uid = sock_i_uid(sk);
+ struct sock *sk2;
+
+ sk_nulls_for_each(sk2, node, &hslot->head) {
+ if (net_eq(sock_net(sk2), net) &&
+ sk2 != sk &&
+ sk2->sk_family == sk->sk_family &&
+ ipv6_only_sock(sk2) == ipv6_only_sock(sk) &&
+ (udp_sk(sk2)->udp_port_hash == udp_sk(sk)->udp_port_hash) &&
+ (sk2->sk_bound_dev_if == sk->sk_bound_dev_if) &&
+ sk2->sk_reuseport && uid_eq(uid, sock_i_uid(sk2)) &&
+ (*saddr_same)(sk, sk2, false)) {
+ return reuseport_add_sock(sk, sk2);
+ }
+ }
+
+ /* Initial allocation may have already happened via setsockopt */
+ if (!rcu_access_pointer(sk->sk_reuseport_cb))
+ return reuseport_alloc(sk);
+ return 0;
+}
+
/**
* udp_lib_get_port - UDP/-Lite port lookup for IPv4 and IPv6
*
@@ -208,7 +241,8 @@ static int udp_lib_lport_inuse2(struct net *net, __u16 num,
*/
int udp_lib_get_port(struct sock *sk, unsigned short snum,
int (*saddr_comp)(const struct sock *sk1,
- const struct sock *sk2),
+ const struct sock *sk2,
+ bool match_wildcard),
unsigned int hash2_nulladdr)
{
struct udp_hslot *hslot, *hslot2;
@@ -291,6 +325,14 @@ found:
udp_sk(sk)->udp_port_hash = snum;
udp_sk(sk)->udp_portaddr_hash ^= snum;
if (sk_unhashed(sk)) {
+ if (sk->sk_reuseport &&
+ udp_reuseport_add_sock(sk, hslot, saddr_comp)) {
+ inet_sk(sk)->inet_num = 0;
+ udp_sk(sk)->udp_port_hash = 0;
+ udp_sk(sk)->udp_portaddr_hash ^= snum;
+ goto fail_unlock;
+ }
+
sk_nulls_add_node_rcu(sk, &hslot->head);
hslot->count++;
sock_prot_inuse_add(sock_net(sk), sk->sk_prot, 1);
@@ -310,13 +352,22 @@ fail:
}
EXPORT_SYMBOL(udp_lib_get_port);
-static int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2)
+/* match_wildcard == true: 0.0.0.0 equals to any IPv4 addresses
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * 0.0.0.0 only equals to 0.0.0.0
+ */
+int ipv4_rcv_saddr_equal(const struct sock *sk1, const struct sock *sk2,
+ bool match_wildcard)
{
struct inet_sock *inet1 = inet_sk(sk1), *inet2 = inet_sk(sk2);
- return (!ipv6_only_sock(sk2) &&
- (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr ||
- inet1->inet_rcv_saddr == inet2->inet_rcv_saddr));
+ if (!ipv6_only_sock(sk2)) {
+ if (inet1->inet_rcv_saddr == inet2->inet_rcv_saddr)
+ return 1;
+ if (!inet1->inet_rcv_saddr || !inet2->inet_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
}
static u32 udp4_portaddr_hash(const struct net *net, __be32 saddr,
@@ -442,11 +493,13 @@ static u32 udp_ehashfn(const struct net *net, const __be32 laddr,
static struct sock *udp4_lib_lookup2(struct net *net,
__be32 saddr, __be16 sport,
__be32 daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2)
+ struct udp_hslot *hslot2, unsigned int slot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
int score, badness, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 hash = 0;
begin:
@@ -462,6 +515,17 @@ begin:
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+
+ sk2 = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
+ if (sk2) {
+ result = sk2;
+ select_ok = false;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -479,6 +543,7 @@ begin:
if (get_nulls_value(node) != slot2)
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -495,7 +560,7 @@ begin:
*/
struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
__be16 sport, __be32 daddr, __be16 dport,
- int dif, struct udp_table *udptable)
+ int dif, struct udp_table *udptable, struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
@@ -503,6 +568,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 hash = 0;
rcu_read_lock();
@@ -515,7 +581,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2);
+ hslot2, slot2, skb);
if (!result) {
hash2 = udp4_portaddr_hash(net, htonl(INADDR_ANY), hnum);
slot2 = hash2 & udptable->mask;
@@ -525,7 +591,7 @@ struct sock *__udp4_lib_lookup(struct net *net, __be32 saddr,
result = udp4_lib_lookup2(net, saddr, sport,
htonl(INADDR_ANY), hnum, dif,
- hslot2, slot2);
+ hslot2, slot2, skb);
}
rcu_read_unlock();
return result;
@@ -543,6 +609,17 @@ begin:
if (reuseport) {
hash = udp_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+
+ sk2 = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
+ if (sk2) {
+ result = sk2;
+ select_ok = false;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -561,6 +638,7 @@ begin:
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score(result, net, saddr, hnum, sport,
@@ -582,13 +660,14 @@ static inline struct sock *__udp4_lib_lookup_skb(struct sk_buff *skb,
return __udp4_lib_lookup(dev_net(skb_dst(skb)->dev), iph->saddr, sport,
iph->daddr, dport, inet_iif(skb),
- udptable);
+ udptable, skb);
}
struct sock *udp4_lib_lookup(struct net *net, __be32 saddr, __be16 sport,
__be32 daddr, __be16 dport, int dif)
{
- return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+ return __udp4_lib_lookup(net, saddr, sport, daddr, dport, dif,
+ &udp_table, NULL);
}
EXPORT_SYMBOL_GPL(udp4_lib_lookup);
@@ -636,7 +715,8 @@ void __udp4_lib_err(struct sk_buff *skb, u32 info, struct udp_table *udptable)
struct net *net = dev_net(skb->dev);
sk = __udp4_lib_lookup(net, iph->daddr, uh->dest,
- iph->saddr, uh->source, skb->dev->ifindex, udptable);
+ iph->saddr, uh->source, skb->dev->ifindex, udptable,
+ NULL);
if (!sk) {
ICMP_INC_STATS_BH(net, ICMP_MIB_INERRORS);
return; /* No socket for error */
@@ -768,31 +848,20 @@ void udp_set_csum(bool nocheck, struct sk_buff *skb,
{
struct udphdr *uh = udp_hdr(skb);
- if (nocheck)
+ if (nocheck) {
uh->check = 0;
- else if (skb_is_gso(skb))
+ } else if (skb_is_gso(skb)) {
uh->check = ~udp_v4_check(len, saddr, daddr, 0);
- else if (skb_dst(skb) && skb_dst(skb)->dev &&
- (skb_dst(skb)->dev->features & NETIF_F_V4_CSUM)) {
-
- BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
+ } else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ uh->check = 0;
+ uh->check = udp_v4_check(len, saddr, daddr, lco_csum(skb));
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
uh->check = ~udp_v4_check(len, saddr, daddr, 0);
- } else {
- __wsum csum;
-
- BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
- uh->check = 0;
- csum = skb_checksum(skb, 0, len, 0);
- uh->check = udp_v4_check(len, saddr, daddr, csum);
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
-
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
}
EXPORT_SYMBOL(udp_set_csum);
@@ -967,8 +1036,10 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (msg->msg_controllen) {
err = ip_cmsg_send(sock_net(sk), msg, &ipc,
sk->sk_family == AF_INET6);
- if (err)
+ if (unlikely(err)) {
+ kfree(ipc.opt);
return err;
+ }
if (ipc.opt)
free = 1;
connected = 0;
@@ -1026,8 +1097,11 @@ int udp_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
flow_flags,
faddr, saddr, dport, inet->inet_sport);
- if (!saddr && ipc.oif)
- l3mdev_get_saddr(net, ipc.oif, fl4);
+ if (!saddr && ipc.oif) {
+ err = l3mdev_get_saddr(net, ipc.oif, fl4);
+ if (err < 0)
+ goto out;
+ }
security_sk_classify_flow(sk, flowi4_to_flowi(fl4));
rt = ip_route_output_flow(net, fl4, sk);
@@ -1271,6 +1345,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, size_t len, int noblock,
int peeked, off = 0;
int err;
int is_udplite = IS_UDPLITE(sk);
+ bool checksum_valid = false;
bool slow;
if (flags & MSG_ERRQUEUE)
@@ -1296,11 +1371,12 @@ try_again:
*/
if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
- if (udp_lib_checksum_complete(skb))
+ checksum_valid = !udp_lib_checksum_complete(skb);
+ if (!checksum_valid)
goto csum_copy_err;
}
- if (skb_csum_unnecessary(skb))
+ if (checksum_valid || skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
msg, copied);
else {
@@ -1396,6 +1472,8 @@ void udp_lib_unhash(struct sock *sk)
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
spin_lock_bh(&hslot->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
if (sk_nulls_del_node_init_rcu(sk)) {
hslot->count--;
inet_sk(sk)->inet_num = 0;
@@ -1423,22 +1501,28 @@ void udp_lib_rehash(struct sock *sk, u16 newhash)
hslot2 = udp_hashslot2(udptable, udp_sk(sk)->udp_portaddr_hash);
nhslot2 = udp_hashslot2(udptable, newhash);
udp_sk(sk)->udp_portaddr_hash = newhash;
- if (hslot2 != nhslot2) {
+
+ if (hslot2 != nhslot2 ||
+ rcu_access_pointer(sk->sk_reuseport_cb)) {
hslot = udp_hashslot(udptable, sock_net(sk),
udp_sk(sk)->udp_port_hash);
/* we must lock primary chain too */
spin_lock_bh(&hslot->lock);
-
- spin_lock(&hslot2->lock);
- hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
- hslot2->count--;
- spin_unlock(&hslot2->lock);
-
- spin_lock(&nhslot2->lock);
- hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
- &nhslot2->head);
- nhslot2->count++;
- spin_unlock(&nhslot2->lock);
+ if (rcu_access_pointer(sk->sk_reuseport_cb))
+ reuseport_detach_sock(sk);
+
+ if (hslot2 != nhslot2) {
+ spin_lock(&hslot2->lock);
+ hlist_nulls_del_init_rcu(&udp_sk(sk)->udp_portaddr_node);
+ hslot2->count--;
+ spin_unlock(&hslot2->lock);
+
+ spin_lock(&nhslot2->lock);
+ hlist_nulls_add_head_rcu(&udp_sk(sk)->udp_portaddr_node,
+ &nhslot2->head);
+ nhslot2->count++;
+ spin_unlock(&nhslot2->lock);
+ }
spin_unlock_bh(&hslot->lock);
}
@@ -1986,10 +2070,14 @@ void udp_v4_early_demux(struct sk_buff *skb)
if (!in_dev)
return;
- ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
- iph->protocol);
- if (!ours)
- return;
+ /* we are supposed to accept bcast packets */
+ if (skb->pkt_type == PACKET_MULTICAST) {
+ ours = ip_check_mc_rcu(in_dev, iph->daddr, iph->saddr,
+ iph->protocol);
+ if (!ours)
+ return;
+ }
+
sk = __udp4_lib_mcast_demux_lookup(net, uh->dest, iph->daddr,
uh->source, iph->saddr, dif);
} else if (skb->pkt_type == PACKET_HOST) {
diff --git a/net/ipv4/udp_diag.c b/net/ipv4/udp_diag.c
index 6116604bf6e8..df1966f3b6ec 100644
--- a/net/ipv4/udp_diag.c
+++ b/net/ipv4/udp_diag.c
@@ -44,7 +44,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
sk = __udp4_lib_lookup(net,
req->id.idiag_src[0], req->id.idiag_sport,
req->id.idiag_dst[0], req->id.idiag_dport,
- req->id.idiag_if, tbl);
+ req->id.idiag_if, tbl, NULL);
#if IS_ENABLED(CONFIG_IPV6)
else if (req->sdiag_family == AF_INET6)
sk = __udp6_lib_lookup(net,
@@ -52,7 +52,7 @@ static int udp_dump_one(struct udp_table *tbl, struct sk_buff *in_skb,
req->id.idiag_sport,
(struct in6_addr *)req->id.idiag_dst,
req->id.idiag_dport,
- req->id.idiag_if, tbl);
+ req->id.idiag_if, tbl, NULL);
#endif
else
goto out_nosk;
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index f9386160cbee..0ed2dafb7cc4 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -21,6 +21,7 @@ static struct udp_offload_priv __rcu *udp_offload_base __read_mostly;
struct udp_offload_priv {
struct udp_offload *offload;
+ possible_net_t net;
struct rcu_head rcu;
struct udp_offload_priv __rcu *next;
};
@@ -31,41 +32,65 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
netdev_features_t features),
__be16 new_protocol, bool is_ipv6)
{
+ int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
+ bool remcsum, need_csum, offload_csum, ufo;
struct sk_buff *segs = ERR_PTR(-EINVAL);
+ struct udphdr *uh = udp_hdr(skb);
u16 mac_offset = skb->mac_header;
- int mac_len = skb->mac_len;
- int tnl_hlen = skb_inner_mac_header(skb) - skb_transport_header(skb);
__be16 protocol = skb->protocol;
- netdev_features_t enc_features;
+ u16 mac_len = skb->mac_len;
int udp_offset, outer_hlen;
- unsigned int oldlen;
- bool need_csum = !!(skb_shinfo(skb)->gso_type &
- SKB_GSO_UDP_TUNNEL_CSUM);
- bool remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
- bool offload_csum = false, dont_encap = (need_csum || remcsum);
-
- oldlen = (u16)~skb->len;
+ __wsum partial;
if (unlikely(!pskb_may_pull(skb, tnl_hlen)))
goto out;
+ /* Adjust partial header checksum to negate old length.
+ * We cannot rely on the value contained in uh->len as it is
+ * possible that the actual value exceeds the boundaries of the
+ * 16 bit length field due to the header being added outside of an
+ * IP or IPv6 frame that was already limited to 64K - 1.
+ */
+ partial = csum_sub(csum_unfold(uh->check),
+ (__force __wsum)htonl(skb->len));
+
+ /* setup inner skb. */
skb->encapsulation = 0;
+ SKB_GSO_CB(skb)->encap_level = 0;
__skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
skb->mac_len = skb_inner_network_offset(skb);
skb->protocol = new_protocol;
+
+ need_csum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP_TUNNEL_CSUM);
skb->encap_hdr_csum = need_csum;
+
+ remcsum = !!(skb_shinfo(skb)->gso_type & SKB_GSO_TUNNEL_REMCSUM);
skb->remcsum_offload = remcsum;
+ ufo = !!(skb_shinfo(skb)->gso_type & SKB_GSO_UDP);
+
/* Try to offload checksum if possible */
offload_csum = !!(need_csum &&
(skb->dev->features &
- (is_ipv6 ? NETIF_F_V6_CSUM : NETIF_F_V4_CSUM)));
+ (is_ipv6 ? (NETIF_F_HW_CSUM | NETIF_F_IPV6_CSUM) :
+ (NETIF_F_HW_CSUM | NETIF_F_IP_CSUM))));
+
+ features &= skb->dev->hw_enc_features;
+
+ /* The only checksum offload we care about from here on out is the
+ * outer one so strip the existing checksum feature flags and
+ * instead set the flag based on our outer checksum offload value.
+ */
+ if (remcsum || ufo) {
+ features &= ~NETIF_F_CSUM_MASK;
+ if (!need_csum || offload_csum)
+ features |= NETIF_F_HW_CSUM;
+ }
/* segment inner packet. */
- enc_features = skb->dev->hw_enc_features & features;
- segs = gso_inner_segment(skb, enc_features);
+ segs = gso_inner_segment(skb, features);
if (IS_ERR_OR_NULL(segs)) {
skb_gso_error_unwind(skb, protocol, tnl_hlen, mac_offset,
mac_len);
@@ -76,17 +101,13 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
udp_offset = outer_hlen - tnl_hlen;
skb = segs;
do {
- struct udphdr *uh;
- int len;
- __be32 delta;
+ __be16 len;
- if (dont_encap) {
- skb->encapsulation = 0;
+ if (remcsum)
skb->ip_summed = CHECKSUM_NONE;
- } else {
- /* Only set up inner headers if we might be offloading
- * inner checksum.
- */
+
+ /* Set up inner headers if we are offloading inner checksum */
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
skb_reset_inner_headers(skb);
skb->encapsulation = 1;
}
@@ -94,43 +115,27 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct sk_buff *skb,
skb->mac_len = mac_len;
skb->protocol = protocol;
- skb_push(skb, outer_hlen);
+ __skb_push(skb, outer_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, mac_len);
skb_set_transport_header(skb, udp_offset);
- len = skb->len - udp_offset;
+ len = htons(skb->len - udp_offset);
uh = udp_hdr(skb);
- uh->len = htons(len);
+ uh->len = len;
if (!need_csum)
continue;
- delta = htonl(oldlen + len);
+ uh->check = ~csum_fold(csum_add(partial, (__force __wsum)len));
- uh->check = ~csum_fold((__force __wsum)
- ((__force u32)uh->check +
- (__force u32)delta));
- if (offload_csum) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_transport_header(skb) - skb->head;
- skb->csum_offset = offsetof(struct udphdr, check);
- } else if (remcsum) {
- /* Need to calculate checksum from scratch,
- * inner checksums are never when doing
- * remote_checksum_offload.
- */
-
- skb->csum = skb_checksum(skb, udp_offset,
- skb->len - udp_offset,
- 0);
- uh->check = csum_fold(skb->csum);
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
- } else {
+ if (skb->encapsulation || !offload_csum) {
uh->check = gso_make_checksum(skb, ~uh->check);
-
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
+ } else {
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_transport_header(skb) - skb->head;
+ skb->csum_offset = offsetof(struct udphdr, check);
}
} while ((skb = skb->next));
out:
@@ -233,6 +238,13 @@ static struct sk_buff *udp4_ufo_fragment(struct sk_buff *skb,
skb->ip_summed = CHECKSUM_NONE;
+ /* If there is no outer header we can fake a checksum offload
+ * due to the fact that we have already done the checksum in
+ * software prior to segmenting the frame.
+ */
+ if (!skb->encap_hdr_csum)
+ features |= NETIF_F_HW_CSUM;
+
/* Fragment the skb. IP headers of the fragments are updated in
* inet_gso_segment()
*/
@@ -241,13 +253,14 @@ out:
return segs;
}
-int udp_add_offload(struct udp_offload *uo)
+int udp_add_offload(struct net *net, struct udp_offload *uo)
{
struct udp_offload_priv *new_offload = kzalloc(sizeof(*new_offload), GFP_ATOMIC);
if (!new_offload)
return -ENOMEM;
+ write_pnet(&new_offload->net, net);
new_offload->offload = uo;
spin_lock(&udp_offload_lock);
@@ -299,19 +312,20 @@ struct sk_buff **udp_gro_receive(struct sk_buff **head, struct sk_buff *skb,
unsigned int off = skb_gro_offset(skb);
int flush = 1;
- if (NAPI_GRO_CB(skb)->udp_mark ||
+ if (NAPI_GRO_CB(skb)->encap_mark ||
(skb->ip_summed != CHECKSUM_PARTIAL &&
NAPI_GRO_CB(skb)->csum_cnt == 0 &&
!NAPI_GRO_CB(skb)->csum_valid))
goto out;
- /* mark that this skb passed once through the udp gro layer */
- NAPI_GRO_CB(skb)->udp_mark = 1;
+ /* mark that this skb passed once through the tunnel gro layer */
+ NAPI_GRO_CB(skb)->encap_mark = 1;
rcu_read_lock();
uo_priv = rcu_dereference(udp_offload_base);
for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (uo_priv->offload->port == uh->dest &&
+ if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
+ uo_priv->offload->port == uh->dest &&
uo_priv->offload->callbacks.gro_receive)
goto unflush;
}
@@ -389,7 +403,8 @@ int udp_gro_complete(struct sk_buff *skb, int nhoff)
uo_priv = rcu_dereference(udp_offload_base);
for (; uo_priv != NULL; uo_priv = rcu_dereference(uo_priv->next)) {
- if (uo_priv->offload->port == uh->dest &&
+ if (net_eq(read_pnet(&uo_priv->net), dev_net(skb->dev)) &&
+ uo_priv->offload->port == uh->dest &&
uo_priv->offload->callbacks.gro_complete)
break;
}
diff --git a/net/ipv4/udp_tunnel.c b/net/ipv4/udp_tunnel.c
index aba428626b52..96599d1a1318 100644
--- a/net/ipv4/udp_tunnel.c
+++ b/net/ipv4/udp_tunnel.c
@@ -74,10 +74,10 @@ void setup_udp_tunnel_sock(struct net *net, struct socket *sock,
}
EXPORT_SYMBOL_GPL(setup_udp_tunnel_sock);
-int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
- __be32 src, __be32 dst, __u8 tos, __u8 ttl,
- __be16 df, __be16 src_port, __be16 dst_port,
- bool xnet, bool nocheck)
+void udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
+ __be32 src, __be32 dst, __u8 tos, __u8 ttl,
+ __be16 df, __be16 src_port, __be16 dst_port,
+ bool xnet, bool nocheck)
{
struct udphdr *uh;
@@ -89,10 +89,11 @@ int udp_tunnel_xmit_skb(struct rtable *rt, struct sock *sk, struct sk_buff *skb,
uh->source = src_port;
uh->len = htons(skb->len);
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
udp_set_csum(nocheck, skb, src, dst, skb->len);
- return iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP,
- tos, ttl, df, xnet);
+ iptunnel_xmit(sk, rt, skb, src, dst, IPPROTO_UDP, tos, ttl, df, xnet);
}
EXPORT_SYMBOL_GPL(udp_tunnel_xmit_skb);
diff --git a/net/ipv4/xfrm4_policy.c b/net/ipv4/xfrm4_policy.c
index 1e0c3c835a63..7b0edb37a115 100644
--- a/net/ipv4/xfrm4_policy.c
+++ b/net/ipv4/xfrm4_policy.c
@@ -259,7 +259,7 @@ static void xfrm4_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
xfrm_dst_ifdown(dst, dev);
}
-static struct dst_ops xfrm4_dst_ops = {
+static struct dst_ops xfrm4_dst_ops_template = {
.family = AF_INET,
.gc = xfrm4_garbage_collect,
.update_pmtu = xfrm4_update_pmtu,
@@ -273,7 +273,7 @@ static struct dst_ops xfrm4_dst_ops = {
static struct xfrm_policy_afinfo xfrm4_policy_afinfo = {
.family = AF_INET,
- .dst_ops = &xfrm4_dst_ops,
+ .dst_ops = &xfrm4_dst_ops_template,
.dst_lookup = xfrm4_dst_lookup,
.get_saddr = xfrm4_get_saddr,
.decode_session = _decode_session4,
@@ -295,7 +295,7 @@ static struct ctl_table xfrm4_policy_table[] = {
{ }
};
-static int __net_init xfrm4_net_init(struct net *net)
+static int __net_init xfrm4_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -323,7 +323,7 @@ err_alloc:
return -ENOMEM;
}
-static void __net_exit xfrm4_net_exit(struct net *net)
+static void __net_exit xfrm4_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
@@ -335,12 +335,44 @@ static void __net_exit xfrm4_net_exit(struct net *net)
if (!net_eq(net, &init_net))
kfree(table);
}
+#else /* CONFIG_SYSCTL */
+static int inline xfrm4_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void inline xfrm4_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm4_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm4_dst_ops, &xfrm4_dst_ops_template,
+ sizeof(xfrm4_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm4_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm4_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm4_net_exit(struct net *net)
+{
+ xfrm4_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm4_dst_ops);
+}
static struct pernet_operations __net_initdata xfrm4_net_ops = {
.init = xfrm4_net_init,
.exit = xfrm4_net_exit,
};
-#endif
static void __init xfrm4_policy_init(void)
{
@@ -349,13 +381,9 @@ static void __init xfrm4_policy_init(void)
void __init xfrm4_init(void)
{
- dst_entries_init(&xfrm4_dst_ops);
-
xfrm4_state_init();
xfrm4_policy_init();
xfrm4_protocol_init();
-#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm4_net_ops);
-#endif
}
diff --git a/net/ipv6/Kconfig b/net/ipv6/Kconfig
index 983bb999738c..11e875ffd7ac 100644
--- a/net/ipv6/Kconfig
+++ b/net/ipv6/Kconfig
@@ -69,6 +69,7 @@ config INET6_ESP
select CRYPTO_CBC
select CRYPTO_SHA1
select CRYPTO_DES
+ select CRYPTO_ECHAINIV
---help---
Support for IPsec ESP.
@@ -94,6 +95,7 @@ config IPV6_MIP6
config IPV6_ILA
tristate "IPv6: Identifier Locator Addressing (ILA)"
+ depends on NETFILTER
select LWTUNNEL
---help---
Support for IPv6 Identifier Locator Addressing (ILA).
@@ -205,6 +207,7 @@ config IPV6_NDISC_NODETYPE
config IPV6_TUNNEL
tristate "IPv6: IP-in-IPv6 tunnel (RFC2473)"
select INET6_TUNNEL
+ select DST_CACHE
---help---
Support for IPv6-in-IPv6 and IPv4-in-IPv6 tunnels described in
RFC 2473.
diff --git a/net/ipv6/Makefile b/net/ipv6/Makefile
index 2c900c7b7eb1..2fbd90bf8d33 100644
--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -34,7 +34,7 @@ obj-$(CONFIG_INET6_XFRM_MODE_TUNNEL) += xfrm6_mode_tunnel.o
obj-$(CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION) += xfrm6_mode_ro.o
obj-$(CONFIG_INET6_XFRM_MODE_BEET) += xfrm6_mode_beet.o
obj-$(CONFIG_IPV6_MIP6) += mip6.o
-obj-$(CONFIG_IPV6_ILA) += ila.o
+obj-$(CONFIG_IPV6_ILA) += ila/
obj-$(CONFIG_NETFILTER) += netfilter/
obj-$(CONFIG_IPV6_VTI) += ip6_vti.o
diff --git a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c
index d72fa90d6feb..27aed1afcf81 100644
--- a/net/ipv6/addrconf.c
+++ b/net/ipv6/addrconf.c
@@ -70,7 +70,7 @@
#include <net/sock.h>
#include <net/snmp.h>
-#include <net/af_ieee802154.h>
+#include <net/6lowpan.h>
#include <net/firewire.h>
#include <net/ipv6.h>
#include <net/protocol.h>
@@ -216,6 +216,7 @@ static struct ipv6_devconf ipv6_devconf __read_mostly = {
},
.use_oif_addrs_only = 0,
.ignore_routes_with_linkdown = 0,
+ .keep_addr_on_down = 0,
};
static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
@@ -260,6 +261,7 @@ static struct ipv6_devconf ipv6_devconf_dflt __read_mostly = {
},
.use_oif_addrs_only = 0,
.ignore_routes_with_linkdown = 0,
+ .keep_addr_on_down = 0,
};
/* Check if a valid qdisc is available */
@@ -350,6 +352,12 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
setup_timer(&ndev->rs_timer, addrconf_rs_timer,
(unsigned long)ndev);
memcpy(&ndev->cnf, dev_net(dev)->ipv6.devconf_dflt, sizeof(ndev->cnf));
+
+ if (ndev->cnf.stable_secret.initialized)
+ ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_STABLE_PRIVACY;
+ else
+ ndev->addr_gen_mode = IN6_ADDR_GEN_MODE_EUI64;
+
ndev->cnf.mtu6 = dev->mtu;
ndev->cnf.sysctl = NULL;
ndev->nd_parms = neigh_parms_alloc(dev, &nd_tbl);
@@ -418,6 +426,7 @@ static struct inet6_dev *ipv6_add_dev(struct net_device *dev)
if (err) {
ipv6_mc_destroy_dev(ndev);
del_timer(&ndev->regen_timer);
+ snmp6_unregister_dev(ndev);
goto err_release;
}
/* protected by rtnl_lock */
@@ -464,18 +473,21 @@ static int inet6_netconf_msgsize_devconf(int type)
{
int size = NLMSG_ALIGN(sizeof(struct netconfmsg))
+ nla_total_size(4); /* NETCONFA_IFINDEX */
+ bool all = false;
+
+ if (type == NETCONFA_ALL)
+ all = true;
- /* type -1 is used for ALL */
- if (type == -1 || type == NETCONFA_FORWARDING)
+ if (all || type == NETCONFA_FORWARDING)
size += nla_total_size(4);
#ifdef CONFIG_IPV6_MROUTE
- if (type == -1 || type == NETCONFA_MC_FORWARDING)
+ if (all || type == NETCONFA_MC_FORWARDING)
size += nla_total_size(4);
#endif
- if (type == -1 || type == NETCONFA_PROXY_NEIGH)
+ if (all || type == NETCONFA_PROXY_NEIGH)
size += nla_total_size(4);
- if (type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
+ if (all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN)
size += nla_total_size(4);
return size;
@@ -488,33 +500,36 @@ static int inet6_netconf_fill_devconf(struct sk_buff *skb, int ifindex,
{
struct nlmsghdr *nlh;
struct netconfmsg *ncm;
+ bool all = false;
nlh = nlmsg_put(skb, portid, seq, event, sizeof(struct netconfmsg),
flags);
if (!nlh)
return -EMSGSIZE;
+ if (type == NETCONFA_ALL)
+ all = true;
+
ncm = nlmsg_data(nlh);
ncm->ncm_family = AF_INET6;
if (nla_put_s32(skb, NETCONFA_IFINDEX, ifindex) < 0)
goto nla_put_failure;
- /* type -1 is used for ALL */
- if ((type == -1 || type == NETCONFA_FORWARDING) &&
+ if ((all || type == NETCONFA_FORWARDING) &&
nla_put_s32(skb, NETCONFA_FORWARDING, devconf->forwarding) < 0)
goto nla_put_failure;
#ifdef CONFIG_IPV6_MROUTE
- if ((type == -1 || type == NETCONFA_MC_FORWARDING) &&
+ if ((all || type == NETCONFA_MC_FORWARDING) &&
nla_put_s32(skb, NETCONFA_MC_FORWARDING,
devconf->mc_forwarding) < 0)
goto nla_put_failure;
#endif
- if ((type == -1 || type == NETCONFA_PROXY_NEIGH) &&
+ if ((all || type == NETCONFA_PROXY_NEIGH) &&
nla_put_s32(skb, NETCONFA_PROXY_NEIGH, devconf->proxy_ndp) < 0)
goto nla_put_failure;
- if ((type == -1 || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
+ if ((all || type == NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN) &&
nla_put_s32(skb, NETCONFA_IGNORE_ROUTES_WITH_LINKDOWN,
devconf->ignore_routes_with_linkdown) < 0)
goto nla_put_failure;
@@ -576,7 +591,7 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
if (err < 0)
goto errout;
- err = EINVAL;
+ err = -EINVAL;
if (!tb[NETCONFA_IFINDEX])
goto errout;
@@ -600,14 +615,14 @@ static int inet6_netconf_get_devconf(struct sk_buff *in_skb,
}
err = -ENOBUFS;
- skb = nlmsg_new(inet6_netconf_msgsize_devconf(-1), GFP_ATOMIC);
+ skb = nlmsg_new(inet6_netconf_msgsize_devconf(NETCONFA_ALL), GFP_ATOMIC);
if (!skb)
goto errout;
err = inet6_netconf_fill_devconf(skb, ifindex, devconf,
NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWNETCONF, 0,
- -1);
+ NETCONFA_ALL);
if (err < 0) {
/* -EMSGSIZE implies BUG in inet6_netconf_msgsize_devconf() */
WARN_ON(err == -EMSGSIZE);
@@ -651,7 +666,7 @@ static int inet6_netconf_dump_devconf(struct sk_buff *skb,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF,
NLM_F_MULTI,
- -1) < 0) {
+ NETCONFA_ALL) < 0) {
rcu_read_unlock();
goto done;
}
@@ -667,7 +682,7 @@ cont:
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
- -1) < 0)
+ NETCONFA_ALL) < 0)
goto done;
else
h++;
@@ -678,7 +693,7 @@ cont:
NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq,
RTM_NEWNETCONF, NLM_F_MULTI,
- -1) < 0)
+ NETCONFA_ALL) < 0)
goto done;
else
h++;
@@ -1765,12 +1780,13 @@ struct inet6_ifaddr *ipv6_get_ifaddr(struct net *net, const struct in6_addr *add
static void addrconf_dad_stop(struct inet6_ifaddr *ifp, int dad_failed)
{
+ if (dad_failed)
+ ifp->flags |= IFA_F_DADFAILED;
+
if (ifp->flags&IFA_F_PERMANENT) {
spin_lock_bh(&ifp->lock);
addrconf_del_dad_work(ifp);
ifp->flags |= IFA_F_TENTATIVE;
- if (dad_failed)
- ifp->flags |= IFA_F_DADFAILED;
spin_unlock_bh(&ifp->lock);
if (dad_failed)
ipv6_ifa_notify(0, ifp);
@@ -1946,9 +1962,9 @@ static void addrconf_leave_anycast(struct inet6_ifaddr *ifp)
static int addrconf_ifid_eui64(u8 *eui, struct net_device *dev)
{
- if (dev->addr_len != IEEE802154_ADDR_LEN)
+ if (dev->addr_len != EUI64_ADDR_LEN)
return -1;
- memcpy(eui, dev->dev_addr, 8);
+ memcpy(eui, dev->dev_addr, EUI64_ADDR_LEN);
eui[0] ^= 2;
return 0;
}
@@ -2040,7 +2056,6 @@ static int ipv6_generate_eui64(u8 *eui, struct net_device *dev)
case ARPHRD_IPGRE:
return addrconf_ifid_gre(eui, dev);
case ARPHRD_6LOWPAN:
- case ARPHRD_IEEE802154:
return addrconf_ifid_eui64(eui, dev);
case ARPHRD_IEEE1394:
return addrconf_ifid_ieee1394(eui, dev);
@@ -2313,6 +2328,12 @@ static void manage_tempaddrs(struct inet6_dev *idev,
}
}
+static bool is_addr_mode_generate_stable(struct inet6_dev *idev)
+{
+ return idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY ||
+ idev->addr_gen_mode == IN6_ADDR_GEN_MODE_RANDOM;
+}
+
void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
{
struct prefix_info *pinfo;
@@ -2426,8 +2447,7 @@ void addrconf_prefix_rcv(struct net_device *dev, u8 *opt, int len, bool sllao)
in6_dev->token.s6_addr + 8, 8);
read_unlock_bh(&in6_dev->lock);
tokenized = true;
- } else if (in6_dev->addr_gen_mode ==
- IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+ } else if (is_addr_mode_generate_stable(in6_dev) &&
!ipv6_generate_stable_address(&addr, 0,
in6_dev)) {
addr_flags |= IFA_F_STABLE_PRIVACY;
@@ -2454,7 +2474,7 @@ ok:
#ifdef CONFIG_IPV6_OPTIMISTIC_DAD
if (in6_dev->cnf.optimistic_dad &&
!net->ipv6.devconf_all->forwarding && sllao)
- addr_flags = IFA_F_OPTIMISTIC;
+ addr_flags |= IFA_F_OPTIMISTIC;
#endif
/* Do not allow to create too much of autoconfigured
@@ -3027,6 +3047,17 @@ retry:
return 0;
}
+static void ipv6_gen_mode_random_init(struct inet6_dev *idev)
+{
+ struct ipv6_stable_secret *s = &idev->cnf.stable_secret;
+
+ if (s->initialized)
+ return;
+ s = &idev->cnf.stable_secret;
+ get_random_bytes(&s->secret, sizeof(s->secret));
+ s->initialized = true;
+}
+
static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
{
struct in6_addr addr;
@@ -3037,13 +3068,18 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
ipv6_addr_set(&addr, htonl(0xFE800000), 0, 0, 0);
- if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY) {
+ switch (idev->addr_gen_mode) {
+ case IN6_ADDR_GEN_MODE_RANDOM:
+ ipv6_gen_mode_random_init(idev);
+ /* fallthrough */
+ case IN6_ADDR_GEN_MODE_STABLE_PRIVACY:
if (!ipv6_generate_stable_address(&addr, 0, idev))
addrconf_add_linklocal(idev, &addr,
IFA_F_STABLE_PRIVACY);
else if (prefix_route)
addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
- } else if (idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64) {
+ break;
+ case IN6_ADDR_GEN_MODE_EUI64:
/* addrconf_add_linklocal also adds a prefix_route and we
* only need to care about prefix routes if ipv6_generate_eui64
* couldn't generate one.
@@ -3052,6 +3088,11 @@ static void addrconf_addr_gen(struct inet6_dev *idev, bool prefix_route)
addrconf_add_linklocal(idev, &addr, 0);
else if (prefix_route)
addrconf_prefix_route(&addr, 64, idev->dev, 0, 0);
+ break;
+ case IN6_ADDR_GEN_MODE_NONE:
+ default:
+ /* will not add any link local address */
+ break;
}
}
@@ -3065,10 +3106,10 @@ static void addrconf_dev_config(struct net_device *dev)
(dev->type != ARPHRD_FDDI) &&
(dev->type != ARPHRD_ARCNET) &&
(dev->type != ARPHRD_INFINIBAND) &&
- (dev->type != ARPHRD_IEEE802154) &&
(dev->type != ARPHRD_IEEE1394) &&
(dev->type != ARPHRD_TUNNEL6) &&
- (dev->type != ARPHRD_6LOWPAN)) {
+ (dev->type != ARPHRD_6LOWPAN) &&
+ (dev->type != ARPHRD_NONE)) {
/* Alas, we support only Ethernet autoconfiguration. */
return;
}
@@ -3077,6 +3118,11 @@ static void addrconf_dev_config(struct net_device *dev)
if (IS_ERR(idev))
return;
+ /* this device type has no EUI support */
+ if (dev->type == ARPHRD_NONE &&
+ idev->addr_gen_mode == IN6_ADDR_GEN_MODE_EUI64)
+ idev->addr_gen_mode = IN6_ADDR_GEN_MODE_RANDOM;
+
addrconf_addr_gen(idev, false);
}
@@ -3130,6 +3176,81 @@ static void addrconf_gre_config(struct net_device *dev)
}
#endif
+#if IS_ENABLED(CONFIG_NET_L3_MASTER_DEV)
+/* If the host route is cached on the addr struct make sure it is associated
+ * with the proper table. e.g., enslavement can change and if so the cached
+ * host route needs to move to the new table.
+ */
+static void l3mdev_check_host_rt(struct inet6_dev *idev,
+ struct inet6_ifaddr *ifp)
+{
+ if (ifp->rt) {
+ u32 tb_id = l3mdev_fib_table(idev->dev) ? : RT6_TABLE_LOCAL;
+
+ if (tb_id != ifp->rt->rt6i_table->tb6_id) {
+ ip6_del_rt(ifp->rt);
+ ifp->rt = NULL;
+ }
+ }
+}
+#else
+static void l3mdev_check_host_rt(struct inet6_dev *idev,
+ struct inet6_ifaddr *ifp)
+{
+}
+#endif
+
+static int fixup_permanent_addr(struct inet6_dev *idev,
+ struct inet6_ifaddr *ifp)
+{
+ l3mdev_check_host_rt(idev, ifp);
+
+ if (!ifp->rt) {
+ struct rt6_info *rt;
+
+ rt = addrconf_dst_alloc(idev, &ifp->addr, false);
+ if (unlikely(IS_ERR(rt)))
+ return PTR_ERR(rt);
+
+ ifp->rt = rt;
+ }
+
+ if (!(ifp->flags & IFA_F_NOPREFIXROUTE)) {
+ addrconf_prefix_route(&ifp->addr, ifp->prefix_len,
+ idev->dev, 0, 0);
+ }
+
+ addrconf_dad_start(ifp);
+
+ return 0;
+}
+
+static void addrconf_permanent_addr(struct net_device *dev)
+{
+ struct inet6_ifaddr *ifp, *tmp;
+ struct inet6_dev *idev;
+
+ idev = __in6_dev_get(dev);
+ if (!idev)
+ return;
+
+ write_lock_bh(&idev->lock);
+
+ list_for_each_entry_safe(ifp, tmp, &idev->addr_list, if_list) {
+ if ((ifp->flags & IFA_F_PERMANENT) &&
+ fixup_permanent_addr(idev, ifp) < 0) {
+ write_unlock_bh(&idev->lock);
+ ipv6_del_addr(ifp);
+ write_lock_bh(&idev->lock);
+
+ net_info_ratelimited("%s: Failed to add prefix route for address %pI6c; dropping\n",
+ idev->dev->name, &ifp->addr);
+ }
+ }
+
+ write_unlock_bh(&idev->lock);
+}
+
static int addrconf_notify(struct notifier_block *this, unsigned long event,
void *ptr)
{
@@ -3215,6 +3336,9 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
run_pending = 1;
}
+ /* restore routes for permanent addresses */
+ addrconf_permanent_addr(dev);
+
switch (dev->type) {
#if IS_ENABLED(CONFIG_IPV6_SIT)
case ARPHRD_SIT:
@@ -3286,7 +3410,8 @@ static int addrconf_notify(struct notifier_block *this, unsigned long event,
case NETDEV_PRE_TYPE_CHANGE:
case NETDEV_POST_TYPE_CHANGE:
- addrconf_type_change(dev, event);
+ if (idev)
+ addrconf_type_change(dev, event);
break;
}
@@ -3317,7 +3442,10 @@ static int addrconf_ifdown(struct net_device *dev, int how)
{
struct net *net = dev_net(dev);
struct inet6_dev *idev;
- struct inet6_ifaddr *ifa;
+ struct inet6_ifaddr *ifa, *tmp;
+ struct list_head del_list;
+ int _keep_addr;
+ bool keep_addr;
int state, i;
ASSERT_RTNL();
@@ -3344,6 +3472,16 @@ static int addrconf_ifdown(struct net_device *dev, int how)
}
+ /* aggregate the system setting and interface setting */
+ _keep_addr = net->ipv6.devconf_all->keep_addr_on_down;
+ if (!_keep_addr)
+ _keep_addr = idev->cnf.keep_addr_on_down;
+
+ /* combine the user config with event to determine if permanent
+ * addresses are to be removed from address hash table
+ */
+ keep_addr = !(how || _keep_addr <= 0);
+
/* Step 2: clear hash table */
for (i = 0; i < IN6_ADDR_HSIZE; i++) {
struct hlist_head *h = &inet6_addr_lst[i];
@@ -3352,9 +3490,15 @@ static int addrconf_ifdown(struct net_device *dev, int how)
restart:
hlist_for_each_entry_rcu(ifa, h, addr_lst) {
if (ifa->idev == idev) {
- hlist_del_init_rcu(&ifa->addr_lst);
addrconf_del_dad_work(ifa);
- goto restart;
+ /* combined flag + permanent flag decide if
+ * address is retained on a down event
+ */
+ if (!keep_addr ||
+ !(ifa->flags & IFA_F_PERMANENT)) {
+ hlist_del_init_rcu(&ifa->addr_lst);
+ goto restart;
+ }
}
}
spin_unlock_bh(&addrconf_hash_lock);
@@ -3388,31 +3532,53 @@ restart:
write_lock_bh(&idev->lock);
}
- while (!list_empty(&idev->addr_list)) {
- ifa = list_first_entry(&idev->addr_list,
- struct inet6_ifaddr, if_list);
- addrconf_del_dad_work(ifa);
+ /* re-combine the user config with event to determine if permanent
+ * addresses are to be removed from the interface list
+ */
+ keep_addr = (!how && _keep_addr > 0);
- list_del(&ifa->if_list);
+ INIT_LIST_HEAD(&del_list);
+ list_for_each_entry_safe(ifa, tmp, &idev->addr_list, if_list) {
+ addrconf_del_dad_work(ifa);
write_unlock_bh(&idev->lock);
-
spin_lock_bh(&ifa->lock);
- state = ifa->state;
- ifa->state = INET6_IFADDR_STATE_DEAD;
+
+ if (keep_addr && (ifa->flags & IFA_F_PERMANENT)) {
+ /* set state to skip the notifier below */
+ state = INET6_IFADDR_STATE_DEAD;
+ ifa->state = 0;
+ if (!(ifa->flags & IFA_F_NODAD))
+ ifa->flags |= IFA_F_TENTATIVE;
+ } else {
+ state = ifa->state;
+ ifa->state = INET6_IFADDR_STATE_DEAD;
+
+ list_del(&ifa->if_list);
+ list_add(&ifa->if_list, &del_list);
+ }
+
spin_unlock_bh(&ifa->lock);
if (state != INET6_IFADDR_STATE_DEAD) {
__ipv6_ifa_notify(RTM_DELADDR, ifa);
inet6addr_notifier_call_chain(NETDEV_DOWN, ifa);
}
- in6_ifa_put(ifa);
write_lock_bh(&idev->lock);
}
write_unlock_bh(&idev->lock);
+ /* now clean up addresses to be removed */
+ while (!list_empty(&del_list)) {
+ ifa = list_first_entry(&del_list,
+ struct inet6_ifaddr, if_list);
+ list_del(&ifa->if_list);
+
+ in6_ifa_put(ifa);
+ }
+
/* Step 5: Discard anycast and multicast list */
if (how) {
ipv6_ac_destroy_dev(idev);
@@ -3499,6 +3665,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
{
struct inet6_dev *idev = ifp->idev;
struct net_device *dev = idev->dev;
+ bool notify = false;
addrconf_join_solict(dev, &ifp->addr);
@@ -3544,7 +3711,7 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
/* Because optimistic nodes can use this address,
* notify listeners. If DAD fails, RTM_DELADDR is sent.
*/
- ipv6_ifa_notify(RTM_NEWADDR, ifp);
+ notify = true;
}
}
@@ -3552,6 +3719,8 @@ static void addrconf_dad_begin(struct inet6_ifaddr *ifp)
out:
spin_unlock(&ifp->lock);
read_unlock_bh(&idev->lock);
+ if (notify)
+ ipv6_ifa_notify(RTM_NEWADDR, ifp);
}
static void addrconf_dad_start(struct inet6_ifaddr *ifp)
@@ -3641,7 +3810,7 @@ static void addrconf_dad_work(struct work_struct *w)
/* send a neighbour solicitation for our addr */
addrconf_addr_solict_mult(&ifp->addr, &mcaddr);
- ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any, NULL);
+ ndisc_send_ns(ifp->idev->dev, &ifp->addr, &mcaddr, &in6addr_any);
out:
in6_ifa_put(ifp);
rtnl_unlock();
@@ -4672,6 +4841,9 @@ static inline void ipv6_store_devconf(struct ipv6_devconf *cnf,
array[DEVCONF_IGNORE_ROUTES_WITH_LINKDOWN] = cnf->ignore_routes_with_linkdown;
/* we omit DEVCONF_STABLE_SECRET for now */
array[DEVCONF_USE_OIF_ADDRS_ONLY] = cnf->use_oif_addrs_only;
+ array[DEVCONF_DROP_UNICAST_IN_L2_MULTICAST] = cnf->drop_unicast_in_l2_multicast;
+ array[DEVCONF_DROP_UNSOLICITED_NA] = cnf->drop_unsolicited_na;
+ array[DEVCONF_KEEP_ADDR_ON_DOWN] = cnf->keep_addr_on_down;
}
static inline size_t inet6_ifla6_size(void)
@@ -4920,7 +5092,8 @@ static int inet6_set_link_af(struct net_device *dev, const struct nlattr *nla)
if (mode != IN6_ADDR_GEN_MODE_EUI64 &&
mode != IN6_ADDR_GEN_MODE_NONE &&
- mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY)
+ mode != IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
+ mode != IN6_ADDR_GEN_MODE_RANDOM)
return -EINVAL;
if (mode == IN6_ADDR_GEN_MODE_STABLE_PRIVACY &&
@@ -5199,6 +5372,20 @@ int addrconf_sysctl_forward(struct ctl_table *ctl, int write,
}
static
+int addrconf_sysctl_hop_limit(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp, loff_t *ppos)
+{
+ struct ctl_table lctl;
+ int min_hl = 1, max_hl = 255;
+
+ lctl = *ctl;
+ lctl.extra1 = &min_hl;
+ lctl.extra2 = &max_hl;
+
+ return proc_dointvec_minmax(&lctl, write, buffer, lenp, ppos);
+}
+
+static
int addrconf_sysctl_mtu(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
@@ -5362,13 +5549,10 @@ static int addrconf_sysctl_stable_secret(struct ctl_table *ctl, int write,
goto out;
}
- if (!write) {
- err = snprintf(str, sizeof(str), "%pI6",
- &secret->secret);
- if (err >= sizeof(str)) {
- err = -EIO;
- goto out;
- }
+ err = snprintf(str, sizeof(str), "%pI6", &secret->secret);
+ if (err >= sizeof(str)) {
+ err = -EIO;
+ goto out;
}
err = proc_dostring(&lctl, write, buffer, lenp, ppos);
@@ -5453,7 +5637,7 @@ static struct addrconf_sysctl_table
.data = &ipv6_devconf.hop_limit,
.maxlen = sizeof(int),
.mode = 0644,
- .proc_handler = proc_dointvec,
+ .proc_handler = addrconf_sysctl_hop_limit,
},
{
.procname = "mtu",
@@ -5734,6 +5918,28 @@ static struct addrconf_sysctl_table
.proc_handler = addrconf_sysctl_ignore_routes_with_linkdown,
},
{
+ .procname = "drop_unicast_in_l2_multicast",
+ .data = &ipv6_devconf.drop_unicast_in_l2_multicast,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "drop_unsolicited_na",
+ .data = &ipv6_devconf.drop_unsolicited_na,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
+ {
+ .procname = "keep_addr_on_down",
+ .data = &ipv6_devconf.keep_addr_on_down,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+
+ },
+ {
/* sentinel */
}
},
diff --git a/net/ipv6/addrlabel.c b/net/ipv6/addrlabel.c
index 882124ebb438..a8f6986dcbe5 100644
--- a/net/ipv6/addrlabel.c
+++ b/net/ipv6/addrlabel.c
@@ -552,7 +552,7 @@ static int ip6addrlbl_get(struct sk_buff *in_skb, struct nlmsghdr *nlh)
rcu_read_lock();
p = __ipv6_addr_label(net, addr, ipv6_addr_type(addr), ifal->ifal_index);
- if (p && ip6addrlbl_hold(p))
+ if (p && !ip6addrlbl_hold(p))
p = NULL;
lseq = ip6addrlbl_table.seq;
rcu_read_unlock();
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 44bb66bde0e2..b11c37cfd67c 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -109,6 +109,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
int try_loading_module = 0;
int err;
+ if (protocol < 0 || protocol >= IPPROTO_MAX)
+ return -EINVAL;
+
/* Look for the requested type/protocol pair. */
lookup_protocol:
err = -ESOCKTNOSUPPORT;
@@ -232,7 +235,11 @@ lookup_protocol:
* creation time automatically shares.
*/
inet->inet_sport = htons(inet->inet_num);
- sk->sk_prot->hash(sk);
+ err = sk->sk_prot->hash(sk);
+ if (err) {
+ sk_common_release(sk);
+ goto out;
+ }
}
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);
@@ -428,9 +435,11 @@ void inet6_destroy_sock(struct sock *sk)
/* Free tx options */
- opt = xchg(&np->opt, NULL);
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ opt = xchg((__force struct ipv6_txoptions **)&np->opt, NULL);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
}
EXPORT_SYMBOL_GPL(inet6_destroy_sock);
@@ -659,7 +668,10 @@ int inet6_sk_rebuild_header(struct sock *sk)
fl6.fl6_sport = inet->inet_sport;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt),
+ &final);
+ rcu_read_unlock();
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
@@ -668,7 +680,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
return PTR_ERR(dst);
}
- __ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, NULL, NULL);
}
return 0;
diff --git a/net/ipv6/datagram.c b/net/ipv6/datagram.c
index d70b0238f468..428162155280 100644
--- a/net/ipv6/datagram.c
+++ b/net/ipv6/datagram.c
@@ -162,13 +162,18 @@ ipv4_connected:
fl6.fl6_dport = inet->inet_dport;
fl6.fl6_sport = inet->inet_sport;
+ if (!fl6.flowi6_oif)
+ fl6.flowi6_oif = np->sticky_pktinfo.ipi6_ifindex;
+
if (!fl6.flowi6_oif && (addr_type&IPV6_ADDR_MULTICAST))
fl6.flowi6_oif = np->mcast_oif;
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
- opt = flowlabel ? flowlabel->opt : np->opt;
+ rcu_read_lock();
+ opt = flowlabel ? flowlabel->opt : rcu_dereference(np->opt);
final_p = fl6_update_dst(&fl6, opt, &final);
+ rcu_read_unlock();
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
err = 0;
diff --git a/net/ipv6/exthdrs.c b/net/ipv6/exthdrs.c
index ce203b0402be..ea7c4d64a00a 100644
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -727,6 +727,7 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
*((char **)&opt2->dst1opt) += dif;
if (opt2->srcrt)
*((char **)&opt2->srcrt) += dif;
+ atomic_set(&opt2->refcnt, 1);
}
return opt2;
}
@@ -790,7 +791,7 @@ ipv6_renew_options(struct sock *sk, struct ipv6_txoptions *opt,
return ERR_PTR(-ENOBUFS);
memset(opt2, 0, tot_len);
-
+ atomic_set(&opt2->refcnt, 1);
opt2->tot_len = tot_len;
p = (char *)(opt2 + 1);
diff --git a/net/ipv6/exthdrs_core.c b/net/ipv6/exthdrs_core.c
index 5c5d23e59da5..9508a20fbf61 100644
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
@@ -257,7 +257,11 @@ int ipv6_find_hdr(const struct sk_buff *skb, unsigned int *offset,
*fragoff = _frag_off;
return hp->nexthdr;
}
- return -ENOENT;
+ if (!found)
+ return -ENOENT;
+ if (fragoff)
+ *fragoff = _frag_off;
+ break;
}
hdrlen = 8;
} else if (nexthdr == NEXTHDR_AUTH) {
diff --git a/net/ipv6/icmp.c b/net/ipv6/icmp.c
index 36c5a98b0472..0a37ddc7af51 100644
--- a/net/ipv6/icmp.c
+++ b/net/ipv6/icmp.c
@@ -834,11 +834,6 @@ void icmpv6_flow_init(struct sock *sk, struct flowi6 *fl6,
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
}
-/*
- * Special lock-class for __icmpv6_sk:
- */
-static struct lock_class_key icmpv6_socket_sk_dst_lock_key;
-
static int __net_init icmpv6_sk_init(struct net *net)
{
struct sock *sk;
@@ -860,15 +855,6 @@ static int __net_init icmpv6_sk_init(struct net *net)
net->ipv6.icmp_sk[i] = sk;
- /*
- * Split off their lock-class, because sk->sk_dst_lock
- * gets used from softirqs, which is safe for
- * __icmpv6_sk (because those never get directly used
- * via userspace syscalls), but unsafe for normal sockets.
- */
- lockdep_set_class(&sk->sk_dst_lock,
- &icmpv6_socket_sk_dst_lock_key);
-
/* Enough space for 2 64K ICMP packets, including
* sk_buff struct overhead.
*/
diff --git a/net/ipv6/ila/Makefile b/net/ipv6/ila/Makefile
new file mode 100644
index 000000000000..4b32e5921e5c
--- /dev/null
+++ b/net/ipv6/ila/Makefile
@@ -0,0 +1,7 @@
+#
+# Makefile for ILA module
+#
+
+obj-$(CONFIG_IPV6_ILA) += ila.o
+
+ila-objs := ila_common.o ila_lwt.o ila_xlat.o
diff --git a/net/ipv6/ila/ila.h b/net/ipv6/ila/ila.h
new file mode 100644
index 000000000000..28542cb2b387
--- /dev/null
+++ b/net/ipv6/ila/ila.h
@@ -0,0 +1,48 @@
+/*
+ * Copyright (c) 2015 Tom Herbert <tom@herbertland.com>
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License as
+ * published by the Free Software Foundation; either version 2 of
+ * the License, or (at your option) any later version.
+ *
+ */
+
+#ifndef __ILA_H
+#define __ILA_H
+
+#include <linux/errno.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/socket.h>
+#include <linux/skbuff.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/protocol.h>
+#include <uapi/linux/ila.h>
+
+struct ila_params {
+ __be64 locator;
+ __be64 locator_match;
+ __wsum csum_diff;
+};
+
+static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
+{
+ __be32 diff[] = {
+ ~from[0], ~from[1], to[0], to[1],
+ };
+
+ return csum_partial(diff, sizeof(diff), 0);
+}
+
+void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p);
+
+int ila_lwt_init(void);
+void ila_lwt_fini(void);
+int ila_xlat_init(void);
+void ila_xlat_fini(void);
+
+#endif /* __ILA_H */
diff --git a/net/ipv6/ila/ila_common.c b/net/ipv6/ila/ila_common.c
new file mode 100644
index 000000000000..30613050e4ca
--- /dev/null
+++ b/net/ipv6/ila/ila_common.c
@@ -0,0 +1,104 @@
+#include <linux/errno.h>
+#include <linux/ip.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/types.h>
+#include <net/checksum.h>
+#include <net/ip.h>
+#include <net/ip6_fib.h>
+#include <net/lwtunnel.h>
+#include <net/protocol.h>
+#include <uapi/linux/ila.h>
+#include "ila.h"
+
+static __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p)
+{
+ if (*(__be64 *)&ip6h->daddr == p->locator_match)
+ return p->csum_diff;
+ else
+ return compute_csum_diff8((__be32 *)&ip6h->daddr,
+ (__be32 *)&p->locator);
+}
+
+void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
+{
+ __wsum diff;
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ size_t nhoff = sizeof(struct ipv6hdr);
+
+ /* First update checksum */
+ switch (ip6h->nexthdr) {
+ case NEXTHDR_TCP:
+ if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) {
+ struct tcphdr *th = (struct tcphdr *)
+ (skb_network_header(skb) + nhoff);
+
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&th->check, skb,
+ diff, true);
+ }
+ break;
+ case NEXTHDR_UDP:
+ if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) {
+ struct udphdr *uh = (struct udphdr *)
+ (skb_network_header(skb) + nhoff);
+
+ if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&uh->check, skb,
+ diff, true);
+ if (!uh->check)
+ uh->check = CSUM_MANGLED_0;
+ }
+ }
+ break;
+ case NEXTHDR_ICMP:
+ if (likely(pskb_may_pull(skb,
+ nhoff + sizeof(struct icmp6hdr)))) {
+ struct icmp6hdr *ih = (struct icmp6hdr *)
+ (skb_network_header(skb) + nhoff);
+
+ diff = get_csum_diff(ip6h, p);
+ inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb,
+ diff, true);
+ }
+ break;
+ }
+
+ /* Now change destination address */
+ *(__be64 *)&ip6h->daddr = p->locator;
+}
+
+static int __init ila_init(void)
+{
+ int ret;
+
+ ret = ila_lwt_init();
+
+ if (ret)
+ goto fail_lwt;
+
+ ret = ila_xlat_init();
+ if (ret)
+ goto fail_xlat;
+
+ return 0;
+fail_xlat:
+ ila_lwt_fini();
+fail_lwt:
+ return ret;
+}
+
+static void __exit ila_fini(void)
+{
+ ila_xlat_fini();
+ ila_lwt_fini();
+}
+
+module_init(ila_init);
+module_exit(ila_fini);
+MODULE_ALIAS_RTNL_LWT(ILA);
+MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
+MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila.c b/net/ipv6/ila/ila_lwt.c
index 1a6852e1ac69..2ae3c4fd8aab 100644
--- a/net/ipv6/ila.c
+++ b/net/ipv6/ila/ila_lwt.c
@@ -11,12 +11,7 @@
#include <net/lwtunnel.h>
#include <net/protocol.h>
#include <uapi/linux/ila.h>
-
-struct ila_params {
- __be64 locator;
- __be64 locator_match;
- __wsum csum_diff;
-};
+#include "ila.h"
static inline struct ila_params *ila_params_lwtunnel(
struct lwtunnel_state *lwstate)
@@ -24,73 +19,6 @@ static inline struct ila_params *ila_params_lwtunnel(
return (struct ila_params *)lwstate->data;
}
-static inline __wsum compute_csum_diff8(const __be32 *from, const __be32 *to)
-{
- __be32 diff[] = {
- ~from[0], ~from[1], to[0], to[1],
- };
-
- return csum_partial(diff, sizeof(diff), 0);
-}
-
-static inline __wsum get_csum_diff(struct ipv6hdr *ip6h, struct ila_params *p)
-{
- if (*(__be64 *)&ip6h->daddr == p->locator_match)
- return p->csum_diff;
- else
- return compute_csum_diff8((__be32 *)&ip6h->daddr,
- (__be32 *)&p->locator);
-}
-
-static void update_ipv6_locator(struct sk_buff *skb, struct ila_params *p)
-{
- __wsum diff;
- struct ipv6hdr *ip6h = ipv6_hdr(skb);
- size_t nhoff = sizeof(struct ipv6hdr);
-
- /* First update checksum */
- switch (ip6h->nexthdr) {
- case NEXTHDR_TCP:
- if (likely(pskb_may_pull(skb, nhoff + sizeof(struct tcphdr)))) {
- struct tcphdr *th = (struct tcphdr *)
- (skb_network_header(skb) + nhoff);
-
- diff = get_csum_diff(ip6h, p);
- inet_proto_csum_replace_by_diff(&th->check, skb,
- diff, true);
- }
- break;
- case NEXTHDR_UDP:
- if (likely(pskb_may_pull(skb, nhoff + sizeof(struct udphdr)))) {
- struct udphdr *uh = (struct udphdr *)
- (skb_network_header(skb) + nhoff);
-
- if (uh->check || skb->ip_summed == CHECKSUM_PARTIAL) {
- diff = get_csum_diff(ip6h, p);
- inet_proto_csum_replace_by_diff(&uh->check, skb,
- diff, true);
- if (!uh->check)
- uh->check = CSUM_MANGLED_0;
- }
- }
- break;
- case NEXTHDR_ICMP:
- if (likely(pskb_may_pull(skb,
- nhoff + sizeof(struct icmp6hdr)))) {
- struct icmp6hdr *ih = (struct icmp6hdr *)
- (skb_network_header(skb) + nhoff);
-
- diff = get_csum_diff(ip6h, p);
- inet_proto_csum_replace_by_diff(&ih->icmp6_cksum, skb,
- diff, true);
- }
- break;
- }
-
- /* Now change destination address */
- *(__be64 *)&ip6h->daddr = p->locator;
-}
-
static int ila_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
@@ -213,17 +141,12 @@ static const struct lwtunnel_encap_ops ila_encap_ops = {
.cmp_encap = ila_encap_cmp,
};
-static int __init ila_init(void)
+int ila_lwt_init(void)
{
return lwtunnel_encap_add_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA);
}
-static void __exit ila_fini(void)
+void ila_lwt_fini(void)
{
lwtunnel_encap_del_ops(&ila_encap_ops, LWTUNNEL_ENCAP_ILA);
}
-
-module_init(ila_init);
-module_exit(ila_fini);
-MODULE_AUTHOR("Tom Herbert <tom@herbertland.com>");
-MODULE_LICENSE("GPL");
diff --git a/net/ipv6/ila/ila_xlat.c b/net/ipv6/ila/ila_xlat.c
new file mode 100644
index 000000000000..295ca29a23c3
--- /dev/null
+++ b/net/ipv6/ila/ila_xlat.c
@@ -0,0 +1,680 @@
+#include <linux/jhash.h>
+#include <linux/netfilter.h>
+#include <linux/rcupdate.h>
+#include <linux/rhashtable.h>
+#include <linux/vmalloc.h>
+#include <net/genetlink.h>
+#include <net/ila.h>
+#include <net/netns/generic.h>
+#include <uapi/linux/genetlink.h>
+#include "ila.h"
+
+struct ila_xlat_params {
+ struct ila_params ip;
+ __be64 identifier;
+ int ifindex;
+ unsigned int dir;
+};
+
+struct ila_map {
+ struct ila_xlat_params p;
+ struct rhash_head node;
+ struct ila_map __rcu *next;
+ struct rcu_head rcu;
+};
+
+static unsigned int ila_net_id;
+
+struct ila_net {
+ struct rhashtable rhash_table;
+ spinlock_t *locks; /* Bucket locks for entry manipulation */
+ unsigned int locks_mask;
+ bool hooks_registered;
+};
+
+#define LOCKS_PER_CPU 10
+
+static int alloc_ila_locks(struct ila_net *ilan)
+{
+ unsigned int i, size;
+ unsigned int nr_pcpus = num_possible_cpus();
+
+ nr_pcpus = min_t(unsigned int, nr_pcpus, 32UL);
+ size = roundup_pow_of_two(nr_pcpus * LOCKS_PER_CPU);
+
+ if (sizeof(spinlock_t) != 0) {
+#ifdef CONFIG_NUMA
+ if (size * sizeof(spinlock_t) > PAGE_SIZE)
+ ilan->locks = vmalloc(size * sizeof(spinlock_t));
+ else
+#endif
+ ilan->locks = kmalloc_array(size, sizeof(spinlock_t),
+ GFP_KERNEL);
+ if (!ilan->locks)
+ return -ENOMEM;
+ for (i = 0; i < size; i++)
+ spin_lock_init(&ilan->locks[i]);
+ }
+ ilan->locks_mask = size - 1;
+
+ return 0;
+}
+
+static u32 hashrnd __read_mostly;
+static __always_inline void __ila_hash_secret_init(void)
+{
+ net_get_random_once(&hashrnd, sizeof(hashrnd));
+}
+
+static inline u32 ila_identifier_hash(__be64 identifier)
+{
+ u32 *v = (u32 *)&identifier;
+
+ return jhash_2words(v[0], v[1], hashrnd);
+}
+
+static inline spinlock_t *ila_get_lock(struct ila_net *ilan, __be64 identifier)
+{
+ return &ilan->locks[ila_identifier_hash(identifier) & ilan->locks_mask];
+}
+
+static inline int ila_cmp_wildcards(struct ila_map *ila, __be64 loc,
+ int ifindex, unsigned int dir)
+{
+ return (ila->p.ip.locator_match && ila->p.ip.locator_match != loc) ||
+ (ila->p.ifindex && ila->p.ifindex != ifindex) ||
+ !(ila->p.dir & dir);
+}
+
+static inline int ila_cmp_params(struct ila_map *ila, struct ila_xlat_params *p)
+{
+ return (ila->p.ip.locator_match != p->ip.locator_match) ||
+ (ila->p.ifindex != p->ifindex) ||
+ (ila->p.dir != p->dir);
+}
+
+static int ila_cmpfn(struct rhashtable_compare_arg *arg,
+ const void *obj)
+{
+ const struct ila_map *ila = obj;
+
+ return (ila->p.identifier != *(__be64 *)arg->key);
+}
+
+static inline int ila_order(struct ila_map *ila)
+{
+ int score = 0;
+
+ if (ila->p.ip.locator_match)
+ score += 1 << 0;
+
+ if (ila->p.ifindex)
+ score += 1 << 1;
+
+ return score;
+}
+
+static const struct rhashtable_params rht_params = {
+ .nelem_hint = 1024,
+ .head_offset = offsetof(struct ila_map, node),
+ .key_offset = offsetof(struct ila_map, p.identifier),
+ .key_len = sizeof(u64), /* identifier */
+ .max_size = 1048576,
+ .min_size = 256,
+ .automatic_shrinking = true,
+ .obj_cmpfn = ila_cmpfn,
+};
+
+static struct genl_family ila_nl_family = {
+ .id = GENL_ID_GENERATE,
+ .hdrsize = 0,
+ .name = ILA_GENL_NAME,
+ .version = ILA_GENL_VERSION,
+ .maxattr = ILA_ATTR_MAX,
+ .netnsok = true,
+ .parallel_ops = true,
+};
+
+static struct nla_policy ila_nl_policy[ILA_ATTR_MAX + 1] = {
+ [ILA_ATTR_IDENTIFIER] = { .type = NLA_U64, },
+ [ILA_ATTR_LOCATOR] = { .type = NLA_U64, },
+ [ILA_ATTR_LOCATOR_MATCH] = { .type = NLA_U64, },
+ [ILA_ATTR_IFINDEX] = { .type = NLA_U32, },
+ [ILA_ATTR_DIR] = { .type = NLA_U32, },
+};
+
+static int parse_nl_config(struct genl_info *info,
+ struct ila_xlat_params *p)
+{
+ memset(p, 0, sizeof(*p));
+
+ if (info->attrs[ILA_ATTR_IDENTIFIER])
+ p->identifier = (__force __be64)nla_get_u64(
+ info->attrs[ILA_ATTR_IDENTIFIER]);
+
+ if (info->attrs[ILA_ATTR_LOCATOR])
+ p->ip.locator = (__force __be64)nla_get_u64(
+ info->attrs[ILA_ATTR_LOCATOR]);
+
+ if (info->attrs[ILA_ATTR_LOCATOR_MATCH])
+ p->ip.locator_match = (__force __be64)nla_get_u64(
+ info->attrs[ILA_ATTR_LOCATOR_MATCH]);
+
+ if (info->attrs[ILA_ATTR_IFINDEX])
+ p->ifindex = nla_get_s32(info->attrs[ILA_ATTR_IFINDEX]);
+
+ if (info->attrs[ILA_ATTR_DIR])
+ p->dir = nla_get_u32(info->attrs[ILA_ATTR_DIR]);
+
+ return 0;
+}
+
+/* Must be called with rcu readlock */
+static inline struct ila_map *ila_lookup_wildcards(__be64 id, __be64 loc,
+ int ifindex,
+ unsigned int dir,
+ struct ila_net *ilan)
+{
+ struct ila_map *ila;
+
+ ila = rhashtable_lookup_fast(&ilan->rhash_table, &id, rht_params);
+ while (ila) {
+ if (!ila_cmp_wildcards(ila, loc, ifindex, dir))
+ return ila;
+ ila = rcu_access_pointer(ila->next);
+ }
+
+ return NULL;
+}
+
+/* Must be called with rcu readlock */
+static inline struct ila_map *ila_lookup_by_params(struct ila_xlat_params *p,
+ struct ila_net *ilan)
+{
+ struct ila_map *ila;
+
+ ila = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier,
+ rht_params);
+ while (ila) {
+ if (!ila_cmp_params(ila, p))
+ return ila;
+ ila = rcu_access_pointer(ila->next);
+ }
+
+ return NULL;
+}
+
+static inline void ila_release(struct ila_map *ila)
+{
+ kfree_rcu(ila, rcu);
+}
+
+static void ila_free_cb(void *ptr, void *arg)
+{
+ struct ila_map *ila = (struct ila_map *)ptr, *next;
+
+ /* Assume rcu_readlock held */
+ while (ila) {
+ next = rcu_access_pointer(ila->next);
+ ila_release(ila);
+ ila = next;
+ }
+}
+
+static int ila_xlat_addr(struct sk_buff *skb, int dir);
+
+static unsigned int
+ila_nf_input(void *priv,
+ struct sk_buff *skb,
+ const struct nf_hook_state *state)
+{
+ ila_xlat_addr(skb, ILA_DIR_IN);
+ return NF_ACCEPT;
+}
+
+static struct nf_hook_ops ila_nf_hook_ops[] __read_mostly = {
+ {
+ .hook = ila_nf_input,
+ .pf = NFPROTO_IPV6,
+ .hooknum = NF_INET_PRE_ROUTING,
+ .priority = -1,
+ },
+};
+
+static int ila_add_mapping(struct net *net, struct ila_xlat_params *p)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct ila_map *ila, *head;
+ spinlock_t *lock = ila_get_lock(ilan, p->identifier);
+ int err = 0, order;
+
+ if (!ilan->hooks_registered) {
+ /* We defer registering net hooks in the namespace until the
+ * first mapping is added.
+ */
+ err = nf_register_net_hooks(net, ila_nf_hook_ops,
+ ARRAY_SIZE(ila_nf_hook_ops));
+ if (err)
+ return err;
+
+ ilan->hooks_registered = true;
+ }
+
+ ila = kzalloc(sizeof(*ila), GFP_KERNEL);
+ if (!ila)
+ return -ENOMEM;
+
+ ila->p = *p;
+
+ if (p->ip.locator_match) {
+ /* Precompute checksum difference for translation since we
+ * know both the old identifier and the new one.
+ */
+ ila->p.ip.csum_diff = compute_csum_diff8(
+ (__be32 *)&p->ip.locator_match,
+ (__be32 *)&p->ip.locator);
+ }
+
+ order = ila_order(ila);
+
+ spin_lock(lock);
+
+ head = rhashtable_lookup_fast(&ilan->rhash_table, &p->identifier,
+ rht_params);
+ if (!head) {
+ /* New entry for the rhash_table */
+ err = rhashtable_lookup_insert_fast(&ilan->rhash_table,
+ &ila->node, rht_params);
+ } else {
+ struct ila_map *tila = head, *prev = NULL;
+
+ do {
+ if (!ila_cmp_params(tila, p)) {
+ err = -EEXIST;
+ goto out;
+ }
+
+ if (order > ila_order(tila))
+ break;
+
+ prev = tila;
+ tila = rcu_dereference_protected(tila->next,
+ lockdep_is_held(lock));
+ } while (tila);
+
+ if (prev) {
+ /* Insert in sub list of head */
+ RCU_INIT_POINTER(ila->next, tila);
+ rcu_assign_pointer(prev->next, ila);
+ } else {
+ /* Make this ila new head */
+ RCU_INIT_POINTER(ila->next, head);
+ err = rhashtable_replace_fast(&ilan->rhash_table,
+ &head->node,
+ &ila->node, rht_params);
+ if (err)
+ goto out;
+ }
+ }
+
+out:
+ spin_unlock(lock);
+
+ if (err)
+ kfree(ila);
+
+ return err;
+}
+
+static int ila_del_mapping(struct net *net, struct ila_xlat_params *p)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct ila_map *ila, *head, *prev;
+ spinlock_t *lock = ila_get_lock(ilan, p->identifier);
+ int err = -ENOENT;
+
+ spin_lock(lock);
+
+ head = rhashtable_lookup_fast(&ilan->rhash_table,
+ &p->identifier, rht_params);
+ ila = head;
+
+ prev = NULL;
+
+ while (ila) {
+ if (ila_cmp_params(ila, p)) {
+ prev = ila;
+ ila = rcu_dereference_protected(ila->next,
+ lockdep_is_held(lock));
+ continue;
+ }
+
+ err = 0;
+
+ if (prev) {
+ /* Not head, just delete from list */
+ rcu_assign_pointer(prev->next, ila->next);
+ } else {
+ /* It is the head. If there is something in the
+ * sublist we need to make a new head.
+ */
+ head = rcu_dereference_protected(ila->next,
+ lockdep_is_held(lock));
+ if (head) {
+ /* Put first entry in the sublist into the
+ * table
+ */
+ err = rhashtable_replace_fast(
+ &ilan->rhash_table, &ila->node,
+ &head->node, rht_params);
+ if (err)
+ goto out;
+ } else {
+ /* Entry no longer used */
+ err = rhashtable_remove_fast(&ilan->rhash_table,
+ &ila->node,
+ rht_params);
+ }
+ }
+
+ ila_release(ila);
+
+ break;
+ }
+
+out:
+ spin_unlock(lock);
+
+ return err;
+}
+
+static int ila_nl_cmd_add_mapping(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_xlat_params p;
+ int err;
+
+ err = parse_nl_config(info, &p);
+ if (err)
+ return err;
+
+ return ila_add_mapping(net, &p);
+}
+
+static int ila_nl_cmd_del_mapping(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_xlat_params p;
+ int err;
+
+ err = parse_nl_config(info, &p);
+ if (err)
+ return err;
+
+ ila_del_mapping(net, &p);
+
+ return 0;
+}
+
+static int ila_fill_info(struct ila_map *ila, struct sk_buff *msg)
+{
+ if (nla_put_u64(msg, ILA_ATTR_IDENTIFIER,
+ (__force u64)ila->p.identifier) ||
+ nla_put_u64(msg, ILA_ATTR_LOCATOR,
+ (__force u64)ila->p.ip.locator) ||
+ nla_put_u64(msg, ILA_ATTR_LOCATOR_MATCH,
+ (__force u64)ila->p.ip.locator_match) ||
+ nla_put_s32(msg, ILA_ATTR_IFINDEX, ila->p.ifindex) ||
+ nla_put_u32(msg, ILA_ATTR_DIR, ila->p.dir))
+ return -1;
+
+ return 0;
+}
+
+static int ila_dump_info(struct ila_map *ila,
+ u32 portid, u32 seq, u32 flags,
+ struct sk_buff *skb, u8 cmd)
+{
+ void *hdr;
+
+ hdr = genlmsg_put(skb, portid, seq, &ila_nl_family, flags, cmd);
+ if (!hdr)
+ return -ENOMEM;
+
+ if (ila_fill_info(ila, skb) < 0)
+ goto nla_put_failure;
+
+ genlmsg_end(skb, hdr);
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(skb, hdr);
+ return -EMSGSIZE;
+}
+
+static int ila_nl_cmd_get_mapping(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct sk_buff *msg;
+ struct ila_xlat_params p;
+ struct ila_map *ila;
+ int ret;
+
+ ret = parse_nl_config(info, &p);
+ if (ret)
+ return ret;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ rcu_read_lock();
+
+ ila = ila_lookup_by_params(&p, ilan);
+ if (ila) {
+ ret = ila_dump_info(ila,
+ info->snd_portid,
+ info->snd_seq, 0, msg,
+ info->genlhdr->cmd);
+ }
+
+ rcu_read_unlock();
+
+ if (ret < 0)
+ goto out_free;
+
+ return genlmsg_reply(msg, info);
+
+out_free:
+ nlmsg_free(msg);
+ return ret;
+}
+
+struct ila_dump_iter {
+ struct rhashtable_iter rhiter;
+};
+
+static int ila_nl_dump_start(struct netlink_callback *cb)
+{
+ struct net *net = sock_net(cb->skb->sk);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+
+ return rhashtable_walk_init(&ilan->rhash_table, &iter->rhiter);
+}
+
+static int ila_nl_dump_done(struct netlink_callback *cb)
+{
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+
+ rhashtable_walk_exit(&iter->rhiter);
+
+ return 0;
+}
+
+static int ila_nl_dump(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct ila_dump_iter *iter = (struct ila_dump_iter *)cb->args;
+ struct rhashtable_iter *rhiter = &iter->rhiter;
+ struct ila_map *ila;
+ int ret;
+
+ ret = rhashtable_walk_start(rhiter);
+ if (ret && ret != -EAGAIN)
+ goto done;
+
+ for (;;) {
+ ila = rhashtable_walk_next(rhiter);
+
+ if (IS_ERR(ila)) {
+ if (PTR_ERR(ila) == -EAGAIN)
+ continue;
+ ret = PTR_ERR(ila);
+ goto done;
+ } else if (!ila) {
+ break;
+ }
+
+ while (ila) {
+ ret = ila_dump_info(ila, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq, NLM_F_MULTI,
+ skb, ILA_CMD_GET);
+ if (ret)
+ goto done;
+
+ ila = rcu_access_pointer(ila->next);
+ }
+ }
+
+ ret = skb->len;
+
+done:
+ rhashtable_walk_stop(rhiter);
+ return ret;
+}
+
+static const struct genl_ops ila_nl_ops[] = {
+ {
+ .cmd = ILA_CMD_ADD,
+ .doit = ila_nl_cmd_add_mapping,
+ .policy = ila_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_DEL,
+ .doit = ila_nl_cmd_del_mapping,
+ .policy = ila_nl_policy,
+ .flags = GENL_ADMIN_PERM,
+ },
+ {
+ .cmd = ILA_CMD_GET,
+ .doit = ila_nl_cmd_get_mapping,
+ .start = ila_nl_dump_start,
+ .dumpit = ila_nl_dump,
+ .done = ila_nl_dump_done,
+ .policy = ila_nl_policy,
+ },
+};
+
+#define ILA_HASH_TABLE_SIZE 1024
+
+static __net_init int ila_init_net(struct net *net)
+{
+ int err;
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+
+ err = alloc_ila_locks(ilan);
+ if (err)
+ return err;
+
+ rhashtable_init(&ilan->rhash_table, &rht_params);
+
+ return 0;
+}
+
+static __net_exit void ila_exit_net(struct net *net)
+{
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+
+ rhashtable_free_and_destroy(&ilan->rhash_table, ila_free_cb, NULL);
+
+ kvfree(ilan->locks);
+
+ if (ilan->hooks_registered)
+ nf_unregister_net_hooks(net, ila_nf_hook_ops,
+ ARRAY_SIZE(ila_nf_hook_ops));
+}
+
+static struct pernet_operations ila_net_ops = {
+ .init = ila_init_net,
+ .exit = ila_exit_net,
+ .id = &ila_net_id,
+ .size = sizeof(struct ila_net),
+};
+
+static int ila_xlat_addr(struct sk_buff *skb, int dir)
+{
+ struct ila_map *ila;
+ struct ipv6hdr *ip6h = ipv6_hdr(skb);
+ struct net *net = dev_net(skb->dev);
+ struct ila_net *ilan = net_generic(net, ila_net_id);
+ __be64 identifier, locator_match;
+ size_t nhoff;
+
+ /* Assumes skb contains a valid IPv6 header that is pulled */
+
+ identifier = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[8];
+ locator_match = *(__be64 *)&ip6h->daddr.in6_u.u6_addr8[0];
+ nhoff = sizeof(struct ipv6hdr);
+
+ rcu_read_lock();
+
+ ila = ila_lookup_wildcards(identifier, locator_match,
+ skb->dev->ifindex, dir, ilan);
+ if (ila)
+ update_ipv6_locator(skb, &ila->p.ip);
+
+ rcu_read_unlock();
+
+ return 0;
+}
+
+int ila_xlat_incoming(struct sk_buff *skb)
+{
+ return ila_xlat_addr(skb, ILA_DIR_IN);
+}
+EXPORT_SYMBOL(ila_xlat_incoming);
+
+int ila_xlat_outgoing(struct sk_buff *skb)
+{
+ return ila_xlat_addr(skb, ILA_DIR_OUT);
+}
+EXPORT_SYMBOL(ila_xlat_outgoing);
+
+int ila_xlat_init(void)
+{
+ int ret;
+
+ ret = register_pernet_device(&ila_net_ops);
+ if (ret)
+ goto exit;
+
+ ret = genl_register_family_with_ops(&ila_nl_family,
+ ila_nl_ops);
+ if (ret < 0)
+ goto unregister;
+
+ return 0;
+
+unregister:
+ unregister_pernet_device(&ila_net_ops);
+exit:
+ return ret;
+}
+
+void ila_xlat_fini(void)
+{
+ genl_unregister_family(&ila_nl_family);
+ unregister_pernet_device(&ila_net_ops);
+}
diff --git a/net/ipv6/inet6_connection_sock.c b/net/ipv6/inet6_connection_sock.c
index 5d1c7cee2cb2..532c3ef282c5 100644
--- a/net/ipv6/inet6_connection_sock.c
+++ b/net/ipv6/inet6_connection_sock.c
@@ -26,6 +26,7 @@
#include <net/ip6_route.h>
#include <net/sock.h>
#include <net/inet6_connection_sock.h>
+#include <net/sock_reuseport.h>
int inet6_csk_bind_conflict(const struct sock *sk,
const struct inet_bind_bucket *tb, bool relax)
@@ -48,15 +49,16 @@ int inet6_csk_bind_conflict(const struct sock *sk,
if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) &&
(!reuseport || !sk2->sk_reuseport ||
+ rcu_access_pointer(sk->sk_reuseport_cb) ||
(sk2->sk_state != TCP_TIME_WAIT &&
!uid_eq(uid,
sock_i_uid((struct sock *)sk2))))) {
- if (ipv6_rcv_saddr_equal(sk, sk2))
+ if (ipv6_rcv_saddr_equal(sk, sk2, true))
break;
}
if (!relax && reuse && sk2->sk_reuse &&
sk2->sk_state != TCP_LISTEN &&
- ipv6_rcv_saddr_equal(sk, sk2))
+ ipv6_rcv_saddr_equal(sk, sk2, true))
break;
}
}
@@ -78,7 +80,9 @@ struct dst_entry *inet6_csk_route_req(const struct sock *sk,
memset(fl6, 0, sizeof(*fl6));
fl6->flowi6_proto = proto;
fl6->daddr = ireq->ir_v6_rmt_addr;
- final_p = fl6_update_dst(fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
fl6->saddr = ireq->ir_v6_loc_addr;
fl6->flowi6_oif = ireq->ir_iif;
fl6->flowi6_mark = ireq->ir_mark;
@@ -109,14 +113,6 @@ void inet6_csk_addr2sockaddr(struct sock *sk, struct sockaddr *uaddr)
EXPORT_SYMBOL_GPL(inet6_csk_addr2sockaddr);
static inline
-void __inet6_csk_dst_store(struct sock *sk, struct dst_entry *dst,
- const struct in6_addr *daddr,
- const struct in6_addr *saddr)
-{
- __ip6_dst_store(sk, dst, daddr, saddr);
-}
-
-static inline
struct dst_entry *__inet6_csk_dst_check(struct sock *sk, u32 cookie)
{
return __sk_dst_check(sk, cookie);
@@ -142,14 +138,16 @@ static struct dst_entry *inet6_csk_route_socket(struct sock *sk,
fl6->fl6_dport = inet->inet_dport;
security_sk_classify_flow(sk, flowi6_to_flowi(fl6));
- final_p = fl6_update_dst(fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
dst = __inet6_csk_dst_check(sk, np->dst_cookie);
if (!dst) {
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
if (!IS_ERR(dst))
- __inet6_csk_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, NULL, NULL);
}
return dst;
}
@@ -175,7 +173,8 @@ int inet6_csk_xmit(struct sock *sk, struct sk_buff *skb, struct flowi *fl_unused
/* Restore final destination back after routing done */
fl6.daddr = sk->sk_v6_daddr;
- res = ip6_xmit(sk, skb, &fl6, np->opt, np->tclass);
+ res = ip6_xmit(sk, skb, &fl6, rcu_dereference(np->opt),
+ np->tclass);
rcu_read_unlock();
return res;
}
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 21ace5a2bf7c..70f2628be6fa 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -17,11 +17,13 @@
#include <linux/module.h>
#include <linux/random.h>
+#include <net/addrconf.h>
#include <net/inet_connection_sock.h>
#include <net/inet_hashtables.h>
#include <net/inet6_hashtables.h>
#include <net/secure_seq.h>
#include <net/ip.h>
+#include <net/sock_reuseport.h>
u32 inet6_ehashfn(const struct net *net,
const struct in6_addr *laddr, const u16 lport,
@@ -121,7 +123,9 @@ static inline int compute_score(struct sock *sk, struct net *net,
}
struct sock *inet6_lookup_listener(struct net *net,
- struct inet_hashinfo *hashinfo, const struct in6_addr *saddr,
+ struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
+ const struct in6_addr *saddr,
const __be16 sport, const struct in6_addr *daddr,
const unsigned short hnum, const int dif)
{
@@ -129,6 +133,7 @@ struct sock *inet6_lookup_listener(struct net *net,
const struct hlist_nulls_node *node;
struct sock *result;
int score, hiscore, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 phash = 0;
unsigned int hash = inet_lhashfn(net, hnum);
struct inet_listen_hashbucket *ilb = &hashinfo->listening_hash[hash];
@@ -146,6 +151,15 @@ begin:
if (reuseport) {
phash = inet6_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+ sk2 = reuseport_select_sock(sk, phash,
+ skb, doff);
+ if (sk2) {
+ result = sk2;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == hiscore && reuseport) {
@@ -163,11 +177,13 @@ begin:
if (get_nulls_value(node) != hash + LISTENING_NULLS_BASE)
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero(&result->sk_refcnt)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, daddr,
dif) < hiscore)) {
sock_put(result);
+ select_ok = false;
goto begin;
}
}
@@ -177,6 +193,7 @@ begin:
EXPORT_SYMBOL_GPL(inet6_lookup_listener);
struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
const struct in6_addr *saddr, const __be16 sport,
const struct in6_addr *daddr, const __be16 dport,
const int dif)
@@ -184,7 +201,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
struct sock *sk;
local_bh_disable();
- sk = __inet6_lookup(net, hashinfo, saddr, sport, daddr, ntohs(dport), dif);
+ sk = __inet6_lookup(net, hashinfo, skb, doff, saddr, sport, daddr,
+ ntohs(dport), dif);
local_bh_enable();
return sk;
@@ -274,3 +292,59 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row,
__inet6_check_established);
}
EXPORT_SYMBOL_GPL(inet6_hash_connect);
+
+int inet6_hash(struct sock *sk)
+{
+ if (sk->sk_state != TCP_CLOSE) {
+ local_bh_disable();
+ __inet_hash(sk, NULL, ipv6_rcv_saddr_equal);
+ local_bh_enable();
+ }
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(inet6_hash);
+
+/* match_wildcard == true: IPV6_ADDR_ANY equals to any IPv6 addresses if IPv6
+ * only, and any IPv4 addresses if not IPv6 only
+ * match_wildcard == false: addresses must be exactly the same, i.e.
+ * IPV6_ADDR_ANY only equals to IPV6_ADDR_ANY,
+ * and 0.0.0.0 equals to 0.0.0.0 only
+ */
+int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+ bool match_wildcard)
+{
+ const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
+ int sk2_ipv6only = inet_v6_ipv6only(sk2);
+ int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
+ int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
+
+ /* if both are mapped, treat as IPv4 */
+ if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED) {
+ if (!sk2_ipv6only) {
+ if (sk->sk_rcv_saddr == sk2->sk_rcv_saddr)
+ return 1;
+ if (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr)
+ return match_wildcard;
+ }
+ return 0;
+ }
+
+ if (addr_type == IPV6_ADDR_ANY && addr_type2 == IPV6_ADDR_ANY)
+ return 1;
+
+ if (addr_type2 == IPV6_ADDR_ANY && match_wildcard &&
+ !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (addr_type == IPV6_ADDR_ANY && match_wildcard &&
+ !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
+ return 1;
+
+ if (sk2_rcv_saddr6 &&
+ ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
+ return 1;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ipv6_rcv_saddr_equal);
diff --git a/net/ipv6/ip6_checksum.c b/net/ipv6/ip6_checksum.c
index 9a4d7322fb22..b2025bf3da4a 100644
--- a/net/ipv6/ip6_checksum.c
+++ b/net/ipv6/ip6_checksum.c
@@ -6,8 +6,7 @@
#ifndef _HAVE_ARCH_IPV6_CSUM
__sum16 csum_ipv6_magic(const struct in6_addr *saddr,
const struct in6_addr *daddr,
- __u32 len, unsigned short proto,
- __wsum csum)
+ __u32 len, __u8 proto, __wsum csum)
{
int carry;
@@ -98,27 +97,16 @@ void udp6_set_csum(bool nocheck, struct sk_buff *skb,
uh->check = 0;
else if (skb_is_gso(skb))
uh->check = ~udp_v6_check(len, saddr, daddr, 0);
- else if (skb_dst(skb) && skb_dst(skb)->dev &&
- (skb_dst(skb)->dev->features & NETIF_F_IPV6_CSUM)) {
-
- BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
+ else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ uh->check = 0;
+ uh->check = udp_v6_check(len, saddr, daddr, lco_csum(skb));
+ if (uh->check == 0)
+ uh->check = CSUM_MANGLED_0;
+ } else {
skb->ip_summed = CHECKSUM_PARTIAL;
skb->csum_start = skb_transport_header(skb) - skb->head;
skb->csum_offset = offsetof(struct udphdr, check);
uh->check = ~udp_v6_check(len, saddr, daddr, 0);
- } else {
- __wsum csum;
-
- BUG_ON(skb->ip_summed == CHECKSUM_PARTIAL);
-
- uh->check = 0;
- csum = skb_checksum(skb, 0, len, 0);
- uh->check = udp_v6_check(len, saddr, daddr, csum);
- if (uh->check == 0)
- uh->check = CSUM_MANGLED_0;
-
- skb->ip_summed = CHECKSUM_UNNECESSARY;
}
}
EXPORT_SYMBOL(udp6_set_csum);
diff --git a/net/ipv6/ip6_fib.c b/net/ipv6/ip6_fib.c
index 0c7e276c230e..ea071fad67a0 100644
--- a/net/ipv6/ip6_fib.c
+++ b/net/ipv6/ip6_fib.c
@@ -55,8 +55,6 @@ struct fib6_cleaner {
void *arg;
};
-static DEFINE_RWLOCK(fib6_walker_lock);
-
#ifdef CONFIG_IPV6_SUBTREES
#define FWS_INIT FWS_S
#else
@@ -66,7 +64,7 @@ static DEFINE_RWLOCK(fib6_walker_lock);
static void fib6_prune_clones(struct net *net, struct fib6_node *fn);
static struct rt6_info *fib6_find_prefix(struct net *net, struct fib6_node *fn);
static struct fib6_node *fib6_repair_tree(struct net *net, struct fib6_node *fn);
-static int fib6_walk(struct fib6_walker *w);
+static int fib6_walk(struct net *net, struct fib6_walker *w);
static int fib6_walk_continue(struct fib6_walker *w);
/*
@@ -78,21 +76,21 @@ static int fib6_walk_continue(struct fib6_walker *w);
static void fib6_gc_timer_cb(unsigned long arg);
-static LIST_HEAD(fib6_walkers);
-#define FOR_WALKERS(w) list_for_each_entry(w, &fib6_walkers, lh)
+#define FOR_WALKERS(net, w) \
+ list_for_each_entry(w, &(net)->ipv6.fib6_walkers, lh)
-static void fib6_walker_link(struct fib6_walker *w)
+static void fib6_walker_link(struct net *net, struct fib6_walker *w)
{
- write_lock_bh(&fib6_walker_lock);
- list_add(&w->lh, &fib6_walkers);
- write_unlock_bh(&fib6_walker_lock);
+ write_lock_bh(&net->ipv6.fib6_walker_lock);
+ list_add(&w->lh, &net->ipv6.fib6_walkers);
+ write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
-static void fib6_walker_unlink(struct fib6_walker *w)
+static void fib6_walker_unlink(struct net *net, struct fib6_walker *w)
{
- write_lock_bh(&fib6_walker_lock);
+ write_lock_bh(&net->ipv6.fib6_walker_lock);
list_del(&w->lh);
- write_unlock_bh(&fib6_walker_lock);
+ write_unlock_bh(&net->ipv6.fib6_walker_lock);
}
static int fib6_new_sernum(struct net *net)
@@ -325,12 +323,13 @@ static int fib6_dump_node(struct fib6_walker *w)
static void fib6_dump_end(struct netlink_callback *cb)
{
+ struct net *net = sock_net(cb->skb->sk);
struct fib6_walker *w = (void *)cb->args[2];
if (w) {
if (cb->args[4]) {
cb->args[4] = 0;
- fib6_walker_unlink(w);
+ fib6_walker_unlink(net, w);
}
cb->args[2] = 0;
kfree(w);
@@ -348,6 +347,7 @@ static int fib6_dump_done(struct netlink_callback *cb)
static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
struct fib6_walker *w;
int res;
@@ -359,7 +359,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
w->skip = 0;
read_lock_bh(&table->tb6_lock);
- res = fib6_walk(w);
+ res = fib6_walk(net, w);
read_unlock_bh(&table->tb6_lock);
if (res > 0) {
cb->args[4] = 1;
@@ -379,7 +379,7 @@ static int fib6_dump_table(struct fib6_table *table, struct sk_buff *skb,
res = fib6_walk_continue(w);
read_unlock_bh(&table->tb6_lock);
if (res <= 0) {
- fib6_walker_unlink(w);
+ fib6_walker_unlink(net, w);
cb->args[4] = 0;
}
}
@@ -1340,8 +1340,8 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
}
#endif
- read_lock(&fib6_walker_lock);
- FOR_WALKERS(w) {
+ read_lock(&net->ipv6.fib6_walker_lock);
+ FOR_WALKERS(net, w) {
if (!child) {
if (w->root == fn) {
w->root = w->node = NULL;
@@ -1368,7 +1368,7 @@ static struct fib6_node *fib6_repair_tree(struct net *net,
}
}
}
- read_unlock(&fib6_walker_lock);
+ read_unlock(&net->ipv6.fib6_walker_lock);
node_free(fn);
if (pn->fn_flags & RTN_RTINFO || FIB6_SUBTREE(pn))
@@ -1411,8 +1411,8 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
}
/* Adjust walkers */
- read_lock(&fib6_walker_lock);
- FOR_WALKERS(w) {
+ read_lock(&net->ipv6.fib6_walker_lock);
+ FOR_WALKERS(net, w) {
if (w->state == FWS_C && w->leaf == rt) {
RT6_TRACE("walker %p adjusted by delroute\n", w);
w->leaf = rt->dst.rt6_next;
@@ -1420,7 +1420,7 @@ static void fib6_del_route(struct fib6_node *fn, struct rt6_info **rtp,
w->state = FWS_U;
}
}
- read_unlock(&fib6_walker_lock);
+ read_unlock(&net->ipv6.fib6_walker_lock);
rt->dst.rt6_next = NULL;
@@ -1588,17 +1588,17 @@ skip:
}
}
-static int fib6_walk(struct fib6_walker *w)
+static int fib6_walk(struct net *net, struct fib6_walker *w)
{
int res;
w->state = FWS_INIT;
w->node = w->root;
- fib6_walker_link(w);
+ fib6_walker_link(net, w);
res = fib6_walk_continue(w);
if (res <= 0)
- fib6_walker_unlink(w);
+ fib6_walker_unlink(net, w);
return res;
}
@@ -1668,7 +1668,7 @@ static void fib6_clean_tree(struct net *net, struct fib6_node *root,
c.arg = arg;
c.net = net;
- fib6_walk(&c.w);
+ fib6_walk(net, &c.w);
}
static void __fib6_clean_all(struct net *net,
@@ -1725,14 +1725,15 @@ static void fib6_flush_trees(struct net *net)
* Garbage collection
*/
-static struct fib6_gc_args
+struct fib6_gc_args
{
int timeout;
int more;
-} gc_args;
+};
static int fib6_age(struct rt6_info *rt, void *arg)
{
+ struct fib6_gc_args *gc_args = arg;
unsigned long now = jiffies;
/*
@@ -1748,10 +1749,10 @@ static int fib6_age(struct rt6_info *rt, void *arg)
RT6_TRACE("expiring %p\n", rt);
return -1;
}
- gc_args.more++;
+ gc_args->more++;
} else if (rt->rt6i_flags & RTF_CACHE) {
if (atomic_read(&rt->dst.__refcnt) == 0 &&
- time_after_eq(now, rt->dst.lastuse + gc_args.timeout)) {
+ time_after_eq(now, rt->dst.lastuse + gc_args->timeout)) {
RT6_TRACE("aging clone %p\n", rt);
return -1;
} else if (rt->rt6i_flags & RTF_GATEWAY) {
@@ -1769,21 +1770,20 @@ static int fib6_age(struct rt6_info *rt, void *arg)
return -1;
}
}
- gc_args.more++;
+ gc_args->more++;
}
return 0;
}
-static DEFINE_SPINLOCK(fib6_gc_lock);
-
void fib6_run_gc(unsigned long expires, struct net *net, bool force)
{
+ struct fib6_gc_args gc_args;
unsigned long now;
if (force) {
- spin_lock_bh(&fib6_gc_lock);
- } else if (!spin_trylock_bh(&fib6_gc_lock)) {
+ spin_lock_bh(&net->ipv6.fib6_gc_lock);
+ } else if (!spin_trylock_bh(&net->ipv6.fib6_gc_lock)) {
mod_timer(&net->ipv6.ip6_fib_timer, jiffies + HZ);
return;
}
@@ -1792,7 +1792,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
gc_args.more = icmp6_dst_gc();
- fib6_clean_all(net, fib6_age, NULL);
+ fib6_clean_all(net, fib6_age, &gc_args);
now = jiffies;
net->ipv6.ip6_rt_last_gc = now;
@@ -1802,7 +1802,7 @@ void fib6_run_gc(unsigned long expires, struct net *net, bool force)
+ net->ipv6.sysctl.ip6_rt_gc_interval));
else
del_timer(&net->ipv6.ip6_fib_timer);
- spin_unlock_bh(&fib6_gc_lock);
+ spin_unlock_bh(&net->ipv6.fib6_gc_lock);
}
static void fib6_gc_timer_cb(unsigned long arg)
@@ -1814,6 +1814,9 @@ static int __net_init fib6_net_init(struct net *net)
{
size_t size = sizeof(struct hlist_head) * FIB6_TABLE_HASHSZ;
+ spin_lock_init(&net->ipv6.fib6_gc_lock);
+ rwlock_init(&net->ipv6.fib6_walker_lock);
+ INIT_LIST_HEAD(&net->ipv6.fib6_walkers);
setup_timer(&net->ipv6.ip6_fib_timer, fib6_gc_timer_cb, (unsigned long)net);
net->ipv6.rt6_stats = kzalloc(sizeof(*net->ipv6.rt6_stats), GFP_KERNEL);
@@ -1974,7 +1977,8 @@ static int ipv6_route_yield(struct fib6_walker *w)
return 0;
}
-static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
+static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter,
+ struct net *net)
{
memset(&iter->w, 0, sizeof(iter->w));
iter->w.func = ipv6_route_yield;
@@ -1984,7 +1988,7 @@ static void ipv6_route_seq_setup_walk(struct ipv6_route_iter *iter)
iter->w.args = iter;
iter->sernum = iter->w.root->fn_sernum;
INIT_LIST_HEAD(&iter->w.lh);
- fib6_walker_link(&iter->w);
+ fib6_walker_link(net, &iter->w);
}
static struct fib6_table *ipv6_route_seq_next_table(struct fib6_table *tbl,
@@ -2045,16 +2049,16 @@ iter_table:
++*pos;
return iter->w.leaf;
} else if (r < 0) {
- fib6_walker_unlink(&iter->w);
+ fib6_walker_unlink(net, &iter->w);
return NULL;
}
- fib6_walker_unlink(&iter->w);
+ fib6_walker_unlink(net, &iter->w);
iter->tbl = ipv6_route_seq_next_table(iter->tbl, net);
if (!iter->tbl)
return NULL;
- ipv6_route_seq_setup_walk(iter);
+ ipv6_route_seq_setup_walk(iter, net);
goto iter_table;
}
@@ -2069,7 +2073,7 @@ static void *ipv6_route_seq_start(struct seq_file *seq, loff_t *pos)
iter->skip = *pos;
if (iter->tbl) {
- ipv6_route_seq_setup_walk(iter);
+ ipv6_route_seq_setup_walk(iter, net);
return ipv6_route_seq_next(seq, NULL, pos);
} else {
return NULL;
@@ -2085,10 +2089,11 @@ static bool ipv6_route_iter_active(struct ipv6_route_iter *iter)
static void ipv6_route_seq_stop(struct seq_file *seq, void *v)
__releases(RCU_BH)
{
+ struct net *net = seq_file_net(seq);
struct ipv6_route_iter *iter = seq->private;
if (ipv6_route_iter_active(iter))
- fib6_walker_unlink(&iter->w);
+ fib6_walker_unlink(net, &iter->w);
rcu_read_unlock_bh();
}
diff --git a/net/ipv6/ip6_flowlabel.c b/net/ipv6/ip6_flowlabel.c
index 1f9ebe3cbb4a..dc2db4f7b182 100644
--- a/net/ipv6/ip6_flowlabel.c
+++ b/net/ipv6/ip6_flowlabel.c
@@ -540,12 +540,13 @@ int ipv6_flowlabel_opt(struct sock *sk, char __user *optval, int optlen)
}
spin_lock_bh(&ip6_sk_fl_lock);
for (sflp = &np->ipv6_fl_list;
- (sfl = rcu_dereference(*sflp)) != NULL;
+ (sfl = rcu_dereference_protected(*sflp,
+ lockdep_is_held(&ip6_sk_fl_lock))) != NULL;
sflp = &sfl->next) {
if (sfl->fl->label == freq.flr_label) {
if (freq.flr_label == (np->flow_label&IPV6_FLOWLABEL_MASK))
np->flow_label &= ~IPV6_FLOWLABEL_MASK;
- *sflp = rcu_dereference(sfl->next);
+ *sflp = sfl->next;
spin_unlock_bh(&ip6_sk_fl_lock);
fl_release(sfl->fl);
kfree_rcu(sfl, rcu);
diff --git a/net/ipv6/ip6_gre.c b/net/ipv6/ip6_gre.c
index 3c7b9310b33f..4e636e60a360 100644
--- a/net/ipv6/ip6_gre.c
+++ b/net/ipv6/ip6_gre.c
@@ -24,7 +24,6 @@
#include <linux/tcp.h>
#include <linux/udp.h>
#include <linux/if_arp.h>
-#include <linux/mroute.h>
#include <linux/init.h>
#include <linux/in6.h>
#include <linux/inetdevice.h>
@@ -361,7 +360,7 @@ static void ip6gre_tunnel_uninit(struct net_device *dev)
struct ip6gre_net *ign = net_generic(t->net, ip6gre_net_id);
ip6gre_tunnel_unlink(ign, t);
- ip6_tnl_dst_reset(t);
+ dst_cache_reset(&t->dst_cache);
dev_put(dev);
}
@@ -634,7 +633,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
}
if (!fl6->flowi6_mark)
- dst = ip6_tnl_dst_get(tunnel);
+ dst = dst_cache_get(&tunnel->dst_cache);
if (!dst) {
dst = ip6_route_output(net, NULL, fl6);
@@ -703,7 +702,7 @@ static netdev_tx_t ip6gre_xmit2(struct sk_buff *skb,
}
if (!fl6->flowi6_mark && ndst)
- ip6_tnl_dst_set(tunnel, ndst);
+ dst_cache_set_ip6(&tunnel->dst_cache, ndst, &fl6->saddr);
skb_dst_set(skb, dst);
proto = NEXTHDR_GRE;
@@ -778,6 +777,8 @@ static inline int ip6gre_xmit_ipv4(struct sk_buff *skb, struct net_device *dev)
__u32 mtu;
int err;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
encap_limit = t->parms.encap_limit;
@@ -1010,7 +1011,7 @@ static int ip6gre_tnl_change(struct ip6_tnl *t,
t->parms.o_key = p->o_key;
t->parms.i_flags = p->i_flags;
t->parms.o_flags = p->o_flags;
- ip6_tnl_dst_reset(t);
+ dst_cache_reset(&t->dst_cache);
ip6gre_tnl_link_config(t, set_mtu);
return 0;
}
@@ -1220,7 +1221,7 @@ static void ip6gre_dev_free(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- ip6_tnl_dst_destroy(t);
+ dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
free_netdev(dev);
}
@@ -1258,7 +1259,7 @@ static int ip6gre_tunnel_init_common(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
- ret = ip6_tnl_dst_init(tunnel);
+ ret = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
if (ret) {
free_percpu(dev->tstats);
dev->tstats = NULL;
@@ -1513,6 +1514,7 @@ static void ip6gre_tap_setup(struct net_device *dev)
dev->destructor = ip6gre_dev_free;
dev->features |= NETIF_F_NETNS_LOCAL;
+ dev->priv_flags &= ~IFF_TX_SKB_SHARING;
}
static int ip6gre_newlink(struct net *src_net, struct net_device *dev,
@@ -1571,13 +1573,11 @@ static int ip6gre_changelink(struct net_device *dev, struct nlattr *tb[],
return -EEXIST;
} else {
t = nt;
-
- ip6gre_tunnel_unlink(ign, t);
- ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
- ip6gre_tunnel_link(ign, t);
- netdev_state_change(dev);
}
+ ip6gre_tunnel_unlink(ign, t);
+ ip6gre_tnl_change(t, &p, !tb[IFLA_MTU]);
+ ip6gre_tunnel_link(ign, t);
return 0;
}
diff --git a/net/ipv6/ip6_input.c b/net/ipv6/ip6_input.c
index 9075acf081dd..c05c425c2389 100644
--- a/net/ipv6/ip6_input.c
+++ b/net/ipv6/ip6_input.c
@@ -49,7 +49,7 @@
int ip6_rcv_finish(struct net *net, struct sock *sk, struct sk_buff *skb)
{
- if (sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
+ if (net->ipv4.sysctl_ip_early_demux && !skb_dst(skb) && skb->sk == NULL) {
const struct inet6_protocol *ipprot;
ipprot = rcu_dereference(inet6_protos[ipv6_hdr(skb)->nexthdr]);
@@ -134,6 +134,16 @@ int ipv6_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt
IPV6_ADDR_MC_SCOPE(&hdr->daddr) == 1)
goto err;
+ /* If enabled, drop unicast packets that were encapsulated in link-layer
+ * multicast or broadcast to protected against the so-called "hole-196"
+ * attack in 802.11 wireless.
+ */
+ if (!ipv6_addr_is_multicast(&hdr->daddr) &&
+ (skb->pkt_type == PACKET_BROADCAST ||
+ skb->pkt_type == PACKET_MULTICAST) &&
+ idev->cnf.drop_unicast_in_l2_multicast)
+ goto err;
+
/* RFC4291 2.7
* Nodes must not originate a packet to a multicast address whose scope
* field contains the reserved value 0; if such a packet is received, it
diff --git a/net/ipv6/ip6_offload.c b/net/ipv6/ip6_offload.c
index eeca943f12dc..82e9f3076028 100644
--- a/net/ipv6/ip6_offload.c
+++ b/net/ipv6/ip6_offload.c
@@ -258,6 +258,19 @@ out:
return pp;
}
+static struct sk_buff **sit_gro_receive(struct sk_buff **head,
+ struct sk_buff *skb)
+{
+ if (NAPI_GRO_CB(skb)->encap_mark) {
+ NAPI_GRO_CB(skb)->flush = 1;
+ return NULL;
+ }
+
+ NAPI_GRO_CB(skb)->encap_mark = 1;
+
+ return ipv6_gro_receive(head, skb);
+}
+
static int ipv6_gro_complete(struct sk_buff *skb, int nhoff)
{
const struct net_offload *ops;
@@ -302,7 +315,7 @@ static struct packet_offload ipv6_packet_offload __read_mostly = {
static const struct net_offload sit_offload = {
.callbacks = {
.gso_segment = ipv6_gso_segment,
- .gro_receive = ipv6_gro_receive,
+ .gro_receive = sit_gro_receive,
.gro_complete = sit_gro_complete,
},
};
diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
index e6a7bd15b9b7..bc972e7152c7 100644
--- a/net/ipv6/ip6_output.c
+++ b/net/ipv6/ip6_output.c
@@ -332,7 +332,6 @@ static int ip6_forward_proxy_check(struct sk_buff *skb)
static inline int ip6_forward_finish(struct net *net, struct sock *sk,
struct sk_buff *skb)
{
- skb_sender_cpu_clear(skb);
return dst_output(net, sk, skb);
}
@@ -909,6 +908,7 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
struct rt6_info *rt;
#endif
int err;
+ int flags = 0;
/* The correct way to handle this would be to do
* ip6_route_get_saddr, and then ip6_route_output; however,
@@ -940,10 +940,13 @@ static int ip6_dst_lookup_tail(struct net *net, const struct sock *sk,
dst_release(*dst);
*dst = NULL;
}
+
+ if (fl6->flowi6_oif)
+ flags |= RT6_LOOKUP_F_IFACE;
}
if (!*dst)
- *dst = ip6_route_output(net, sk, fl6);
+ *dst = ip6_route_output_flags(net, sk, fl6, flags);
err = (*dst)->error;
if (err)
@@ -1087,8 +1090,8 @@ static inline int ip6_ufo_append_data(struct sock *sk,
int getfrag(void *from, char *to, int offset, int len,
int odd, struct sk_buff *skb),
void *from, int length, int hh_len, int fragheaderlen,
- int transhdrlen, int mtu, unsigned int flags,
- const struct flowi6 *fl6)
+ int exthdrlen, int transhdrlen, int mtu,
+ unsigned int flags, const struct flowi6 *fl6)
{
struct sk_buff *skb;
@@ -1113,7 +1116,7 @@ static inline int ip6_ufo_append_data(struct sock *sk,
skb_put(skb, fragheaderlen + transhdrlen);
/* initialize network header pointer */
- skb_reset_network_header(skb);
+ skb_set_network_header(skb, exthdrlen);
/* initialize protocol header pointer */
skb->transport_header = skb->network_header + fragheaderlen;
@@ -1322,7 +1325,7 @@ emsgsize:
headersize == sizeof(struct ipv6hdr) &&
length < mtu - headersize &&
!(flags & MSG_MORE) &&
- rt->dst.dev->features & NETIF_F_V6_CSUM)
+ rt->dst.dev->features & (NETIF_F_IPV6_CSUM | NETIF_F_HW_CSUM))
csummode = CHECKSUM_PARTIAL;
if (sk->sk_type == SOCK_DGRAM || sk->sk_type == SOCK_RAW) {
@@ -1353,9 +1356,9 @@ emsgsize:
(skb && skb_is_gso(skb))) &&
(sk->sk_protocol == IPPROTO_UDP) &&
(rt->dst.dev->features & NETIF_F_UFO) &&
- (sk->sk_type == SOCK_DGRAM)) {
+ (sk->sk_type == SOCK_DGRAM) && !udp_get_no_check6_tx(sk)) {
err = ip6_ufo_append_data(sk, queue, getfrag, from, length,
- hh_len, fragheaderlen,
+ hh_len, fragheaderlen, exthdrlen,
transhdrlen, mtu, flags, fl6);
if (err)
goto error;
diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
index eabffbb89795..1f20345cbc97 100644
--- a/net/ipv6/ip6_tunnel.c
+++ b/net/ipv6/ip6_tunnel.c
@@ -122,97 +122,6 @@ static struct net_device_stats *ip6_get_stats(struct net_device *dev)
return &dev->stats;
}
-/*
- * Locking : hash tables are protected by RCU and RTNL
- */
-
-static void ip6_tnl_per_cpu_dst_set(struct ip6_tnl_dst *idst,
- struct dst_entry *dst)
-{
- write_seqlock_bh(&idst->lock);
- dst_release(rcu_dereference_protected(
- idst->dst,
- lockdep_is_held(&idst->lock.lock)));
- if (dst) {
- dst_hold(dst);
- idst->cookie = rt6_get_cookie((struct rt6_info *)dst);
- } else {
- idst->cookie = 0;
- }
- rcu_assign_pointer(idst->dst, dst);
- write_sequnlock_bh(&idst->lock);
-}
-
-struct dst_entry *ip6_tnl_dst_get(struct ip6_tnl *t)
-{
- struct ip6_tnl_dst *idst;
- struct dst_entry *dst;
- unsigned int seq;
- u32 cookie;
-
- idst = raw_cpu_ptr(t->dst_cache);
-
- rcu_read_lock();
- do {
- seq = read_seqbegin(&idst->lock);
- dst = rcu_dereference(idst->dst);
- cookie = idst->cookie;
- } while (read_seqretry(&idst->lock, seq));
-
- if (dst && !atomic_inc_not_zero(&dst->__refcnt))
- dst = NULL;
- rcu_read_unlock();
-
- if (dst && dst->obsolete && !dst->ops->check(dst, cookie)) {
- ip6_tnl_per_cpu_dst_set(idst, NULL);
- dst_release(dst);
- dst = NULL;
- }
- return dst;
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_get);
-
-void ip6_tnl_dst_reset(struct ip6_tnl *t)
-{
- int i;
-
- for_each_possible_cpu(i)
- ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), NULL);
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_reset);
-
-void ip6_tnl_dst_set(struct ip6_tnl *t, struct dst_entry *dst)
-{
- ip6_tnl_per_cpu_dst_set(raw_cpu_ptr(t->dst_cache), dst);
-
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_set);
-
-void ip6_tnl_dst_destroy(struct ip6_tnl *t)
-{
- if (!t->dst_cache)
- return;
-
- ip6_tnl_dst_reset(t);
- free_percpu(t->dst_cache);
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_destroy);
-
-int ip6_tnl_dst_init(struct ip6_tnl *t)
-{
- int i;
-
- t->dst_cache = alloc_percpu(struct ip6_tnl_dst);
- if (!t->dst_cache)
- return -ENOMEM;
-
- for_each_possible_cpu(i)
- seqlock_init(&per_cpu_ptr(t->dst_cache, i)->lock);
-
- return 0;
-}
-EXPORT_SYMBOL_GPL(ip6_tnl_dst_init);
-
/**
* ip6_tnl_lookup - fetch tunnel matching the end-point addresses
* @remote: the address of the tunnel exit-point
@@ -329,7 +238,7 @@ static void ip6_dev_free(struct net_device *dev)
{
struct ip6_tnl *t = netdev_priv(dev);
- ip6_tnl_dst_destroy(t);
+ dst_cache_destroy(&t->dst_cache);
free_percpu(dev->tstats);
free_netdev(dev);
}
@@ -343,12 +252,12 @@ static int ip6_tnl_create2(struct net_device *dev)
t = netdev_priv(dev);
+ dev->rtnl_link_ops = &ip6_link_ops;
err = register_netdevice(dev);
if (err < 0)
goto out;
strcpy(t->parms.name, dev->name);
- dev->rtnl_link_ops = &ip6_link_ops;
dev_hold(dev);
ip6_tnl_link(ip6n, t);
@@ -462,7 +371,7 @@ ip6_tnl_dev_uninit(struct net_device *dev)
RCU_INIT_POINTER(ip6n->tnls_wc[0], NULL);
else
ip6_tnl_unlink(ip6n, t);
- ip6_tnl_dst_reset(t);
+ dst_cache_reset(&t->dst_cache);
dev_put(dev);
}
@@ -1069,7 +978,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
memcpy(&fl6->daddr, addr6, sizeof(fl6->daddr));
neigh_release(neigh);
} else if (!fl6->flowi6_mark)
- dst = ip6_tnl_dst_get(t);
+ dst = dst_cache_get(&t->dst_cache);
if (!ip6_tnl_xmit_ctl(t, &fl6->saddr, &fl6->daddr))
goto tx_err_link_failure;
@@ -1133,7 +1042,7 @@ static int ip6_tnl_xmit2(struct sk_buff *skb,
}
if (!fl6->flowi6_mark && ndst)
- ip6_tnl_dst_set(t, ndst);
+ dst_cache_set_ip6(&t->dst_cache, ndst, &fl6->saddr);
skb_dst_set(skb, dst);
skb->transport_header = skb->network_header;
@@ -1180,6 +1089,8 @@ ip4ip6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
u8 tproto;
int err;
+ memset(&(IPCB(skb)->opt), 0, sizeof(IPCB(skb)->opt));
+
tproto = ACCESS_ONCE(t->parms.proto);
if (tproto != IPPROTO_IPIP && tproto != 0)
return -1;
@@ -1366,7 +1277,7 @@ ip6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
t->parms.flowinfo = p->flowinfo;
t->parms.link = p->link;
t->parms.proto = p->proto;
- ip6_tnl_dst_reset(t);
+ dst_cache_reset(&t->dst_cache);
ip6_tnl_link_config(t);
return 0;
}
@@ -1637,7 +1548,7 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
- ret = ip6_tnl_dst_init(t);
+ ret = dst_cache_init(&t->dst_cache, GFP_KERNEL);
if (ret) {
free_percpu(dev->tstats);
dev->tstats = NULL;
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index 14dacf1df529..a7520528ecd2 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -73,8 +73,8 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
struct sk_buff *skb,
struct net_device *dev, struct in6_addr *saddr,
struct in6_addr *daddr,
- __u8 prio, __u8 ttl, __be16 src_port,
- __be16 dst_port, bool nocheck)
+ __u8 prio, __u8 ttl, __be32 label,
+ __be16 src_port, __be16 dst_port, bool nocheck)
{
struct udphdr *uh;
struct ipv6hdr *ip6h;
@@ -98,7 +98,7 @@ int udp_tunnel6_xmit_skb(struct dst_entry *dst, struct sock *sk,
__skb_push(skb, sizeof(*ip6h));
skb_reset_network_header(skb);
ip6h = ipv6_hdr(skb);
- ip6_flow_hdr(ip6h, prio, htonl(0));
+ ip6_flow_hdr(ip6h, prio, label);
ip6h->payload_len = htons(skb->len);
ip6h->nexthdr = IPPROTO_UDP;
ip6h->hop_limit = ttl;
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 0a8610b33d79..d90a11f14040 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -640,7 +640,7 @@ vti6_tnl_change(struct ip6_tnl *t, const struct __ip6_tnl_parm *p)
t->parms.i_key = p->i_key;
t->parms.o_key = p->o_key;
t->parms.proto = p->proto;
- ip6_tnl_dst_reset(t);
+ dst_cache_reset(&t->dst_cache);
vti6_link_config(t);
return 0;
}
diff --git a/net/ipv6/ip6mr.c b/net/ipv6/ip6mr.c
index ad19136086dd..a10e77103c88 100644
--- a/net/ipv6/ip6mr.c
+++ b/net/ipv6/ip6mr.c
@@ -118,7 +118,7 @@ static void mr6_netlink_event(struct mr6_table *mrt, struct mfc6_cache *mfc,
int cmd);
static int ip6mr_rtm_dumproute(struct sk_buff *skb,
struct netlink_callback *cb);
-static void mroute_clean_tables(struct mr6_table *mrt);
+static void mroute_clean_tables(struct mr6_table *mrt, bool all);
static void ipmr_expire_process(unsigned long arg);
#ifdef CONFIG_IPV6_MROUTE_MULTIPLE_TABLES
@@ -334,7 +334,7 @@ static struct mr6_table *ip6mr_new_table(struct net *net, u32 id)
static void ip6mr_free_table(struct mr6_table *mrt)
{
del_timer_sync(&mrt->ipmr_expire_timer);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, true);
kfree(mrt);
}
@@ -765,10 +765,6 @@ static struct net_device *ip6mr_reg_vif(struct net *net, struct mr6_table *mrt)
return dev;
failure:
- /* allow the register to be completed before unregistering. */
- rtnl_unlock();
- rtnl_lock();
-
unregister_netdevice(dev);
return NULL;
}
@@ -1542,7 +1538,7 @@ static int ip6mr_mfc_add(struct net *net, struct mr6_table *mrt,
* Close the multicast socket, and clear the vif tables etc
*/
-static void mroute_clean_tables(struct mr6_table *mrt)
+static void mroute_clean_tables(struct mr6_table *mrt, bool all)
{
int i;
LIST_HEAD(list);
@@ -1552,8 +1548,9 @@ static void mroute_clean_tables(struct mr6_table *mrt)
* Shut down all active vif entries
*/
for (i = 0; i < mrt->maxvif; i++) {
- if (!(mrt->vif6_table[i].flags & VIFF_STATIC))
- mif6_delete(mrt, i, &list);
+ if (!all && (mrt->vif6_table[i].flags & VIFF_STATIC))
+ continue;
+ mif6_delete(mrt, i, &list);
}
unregister_netdevice_many(&list);
@@ -1562,7 +1559,7 @@ static void mroute_clean_tables(struct mr6_table *mrt)
*/
for (i = 0; i < MFC6_LINES; i++) {
list_for_each_entry_safe(c, next, &mrt->mfc6_cache_array[i], list) {
- if (c->mfc_flags & MFC_STATIC)
+ if (!all && (c->mfc_flags & MFC_STATIC))
continue;
write_lock_bh(&mrt_lock);
list_del(&c->list);
@@ -1625,7 +1622,7 @@ int ip6mr_sk_done(struct sock *sk)
net->ipv6.devconf_all);
write_unlock_bh(&mrt_lock);
- mroute_clean_tables(mrt);
+ mroute_clean_tables(mrt, false);
err = 0;
break;
}
diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 63e6956917c9..4449ad1f8114 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -111,7 +111,8 @@ struct ipv6_txoptions *ipv6_update_options(struct sock *sk,
icsk->icsk_sync_mss(sk, icsk->icsk_pmtu_cookie);
}
}
- opt = xchg(&inet6_sk(sk)->opt, opt);
+ opt = xchg((__force struct ipv6_txoptions **)&inet6_sk(sk)->opt,
+ opt);
sk_dst_reset(sk);
return opt;
@@ -231,9 +232,12 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
sk->sk_socket->ops = &inet_dgram_ops;
sk->sk_family = PF_INET;
}
- opt = xchg(&np->opt, NULL);
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ opt = xchg((__force struct ipv6_txoptions **)&np->opt,
+ NULL);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
pktopt = xchg(&np->pktoptions, NULL);
kfree_skb(pktopt);
@@ -403,7 +407,8 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
if (optname != IPV6_RTHDR && !ns_capable(net->user_ns, CAP_NET_RAW))
break;
- opt = ipv6_renew_options(sk, np->opt, optname,
+ opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ opt = ipv6_renew_options(sk, opt, optname,
(struct ipv6_opt_hdr __user *)optval,
optlen);
if (IS_ERR(opt)) {
@@ -432,8 +437,10 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, int optname,
retv = 0;
opt = ipv6_update_options(sk, opt);
sticky_done:
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
break;
}
@@ -486,6 +493,7 @@ sticky_done:
break;
memset(opt, 0, sizeof(*opt));
+ atomic_set(&opt->refcnt, 1);
opt->tot_len = sizeof(*opt) + optlen;
retv = -EFAULT;
if (copy_from_user(opt+1, optval, optlen))
@@ -502,8 +510,10 @@ update:
retv = 0;
opt = ipv6_update_options(sk, opt);
done:
- if (opt)
- sock_kfree_s(sk, opt, opt->tot_len);
+ if (opt) {
+ atomic_sub(opt->tot_len, &sk->sk_omem_alloc);
+ txopt_put(opt);
+ }
break;
}
case IPV6_UNICAST_HOPS:
@@ -1110,10 +1120,11 @@ static int do_ipv6_getsockopt(struct sock *sk, int level, int optname,
case IPV6_RTHDR:
case IPV6_DSTOPTS:
{
+ struct ipv6_txoptions *opt;
lock_sock(sk);
- len = ipv6_getsockopt_sticky(sk, np->opt,
- optname, optval, len);
+ opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ len = ipv6_getsockopt_sticky(sk, opt, optname, optval, len);
release_sock(sk);
/* check if ipv6_getsockopt_sticky() returns err code */
if (len < 0)
diff --git a/net/ipv6/mcast.c b/net/ipv6/mcast.c
index 124338a39e29..d64ee7e83664 100644
--- a/net/ipv6/mcast.c
+++ b/net/ipv6/mcast.c
@@ -1574,9 +1574,8 @@ static struct sk_buff *mld_newpack(struct inet6_dev *idev, unsigned int mtu)
return NULL;
skb->priority = TC_PRIO_CONTROL;
- skb->reserved_tailroom = skb_end_offset(skb) -
- min(mtu, skb_end_offset(skb));
skb_reserve(skb, hlen);
+ skb_tailroom_reserve(skb, mtu, tlen);
if (__ipv6_get_lladdr(idev, &addr_buf, IFA_F_TENTATIVE)) {
/* <draft-ietf-magma-mld-source-05.txt>:
@@ -1651,7 +1650,6 @@ out:
if (!err) {
ICMP6MSGOUT_INC_STATS(net, idev, ICMPV6_MLD2_REPORT);
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
- IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, payload_len);
} else {
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
}
@@ -2015,7 +2013,6 @@ out:
if (!err) {
ICMP6MSGOUT_INC_STATS(net, idev, type);
ICMP6_INC_STATS(net, idev, ICMP6_MIB_OUTMSGS);
- IP6_UPD_PO_STATS(net, idev, IPSTATS_MIB_OUTMCAST, full_len);
} else
IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
diff --git a/net/ipv6/ndisc.c b/net/ipv6/ndisc.c
index 3e0f855e1bea..c245895a3d41 100644
--- a/net/ipv6/ndisc.c
+++ b/net/ipv6/ndisc.c
@@ -556,8 +556,7 @@ static void ndisc_send_unsol_na(struct net_device *dev)
}
void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
- const struct in6_addr *daddr, const struct in6_addr *saddr,
- struct sk_buff *oskb)
+ const struct in6_addr *daddr, const struct in6_addr *saddr)
{
struct sk_buff *skb;
struct in6_addr addr_buf;
@@ -593,9 +592,6 @@ void ndisc_send_ns(struct net_device *dev, const struct in6_addr *solicit,
ndisc_fill_addr_option(skb, ND_OPT_SOURCE_LL_ADDR,
dev->dev_addr);
- if (!(dev->priv_flags & IFF_XMIT_DST_RELEASE) && oskb)
- skb_dst_copy(skb, oskb);
-
ndisc_send_skb(skb, daddr, saddr);
}
@@ -682,12 +678,12 @@ static void ndisc_solicit(struct neighbour *neigh, struct sk_buff *skb)
"%s: trying to ucast probe in NUD_INVALID: %pI6\n",
__func__, target);
}
- ndisc_send_ns(dev, target, target, saddr, skb);
+ ndisc_send_ns(dev, target, target, saddr);
} else if ((probes -= NEIGH_VAR(neigh->parms, APP_PROBES)) < 0) {
neigh_app_ns(neigh);
} else {
addrconf_addr_solict_mult(target, &mcaddr);
- ndisc_send_ns(dev, target, &mcaddr, saddr, skb);
+ ndisc_send_ns(dev, target, &mcaddr, saddr);
}
}
@@ -887,6 +883,7 @@ static void ndisc_recv_na(struct sk_buff *skb)
offsetof(struct nd_msg, opt));
struct ndisc_options ndopts;
struct net_device *dev = skb->dev;
+ struct inet6_dev *idev = __in6_dev_get(dev);
struct inet6_ifaddr *ifp;
struct neighbour *neigh;
@@ -906,6 +903,14 @@ static void ndisc_recv_na(struct sk_buff *skb)
return;
}
+ /* For some 802.11 wireless deployments (and possibly other networks),
+ * there will be a NA proxy and unsolicitd packets are attacks
+ * and thus should not be accepted.
+ */
+ if (!msg->icmph.icmp6_solicited && idev &&
+ idev->cnf.drop_unsolicited_na)
+ return;
+
if (!ndisc_parse_options(msg->opt, ndoptlen, &ndopts)) {
ND_PRINTK(2, warn, "NS: invalid ND option\n");
return;
@@ -1187,7 +1192,7 @@ static void ndisc_router_discovery(struct sk_buff *skb)
*/
if (!in6_dev->cnf.accept_ra_from_local &&
ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
- NULL, 0)) {
+ in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: default router ignored\n",
skb->dev->name);
@@ -1341,7 +1346,7 @@ skip_linkparms:
#ifdef CONFIG_IPV6_ROUTE_INFO
if (!in6_dev->cnf.accept_ra_from_local &&
ipv6_chk_addr(dev_net(in6_dev->dev), &ipv6_hdr(skb)->saddr,
- NULL, 0)) {
+ in6_dev->dev, 0)) {
ND_PRINTK(2, info,
"RA from local address detected on dev: %s: router info ignored.\n",
skb->dev->name);
diff --git a/net/ipv6/netfilter/Kconfig b/net/ipv6/netfilter/Kconfig
index f6a024e141e5..e10a04c9cdc7 100644
--- a/net/ipv6/netfilter/Kconfig
+++ b/net/ipv6/netfilter/Kconfig
@@ -49,6 +49,7 @@ config NFT_REJECT_IPV6
config NFT_DUP_IPV6
tristate "IPv6 nf_tables packet duplication support"
+ depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DUP_IPV6
help
This module enables IPv6 packet duplication support for nf_tables.
diff --git a/net/ipv6/netfilter/ip6_tables.c b/net/ipv6/netfilter/ip6_tables.c
index 99425cf2819b..86b67b70b626 100644
--- a/net/ipv6/netfilter/ip6_tables.c
+++ b/net/ipv6/netfilter/ip6_tables.c
@@ -198,11 +198,12 @@ get_entry(const void *base, unsigned int offset)
/* All zeroes == unconditional rule. */
/* Mildly perf critical (only if packet tracing is on) */
-static inline bool unconditional(const struct ip6t_ip6 *ipv6)
+static inline bool unconditional(const struct ip6t_entry *e)
{
static const struct ip6t_ip6 uncond;
- return memcmp(ipv6, &uncond, sizeof(uncond)) == 0;
+ return e->target_offset == sizeof(struct ip6t_entry) &&
+ memcmp(&e->ipv6, &uncond, sizeof(uncond)) == 0;
}
static inline const struct xt_entry_target *
@@ -258,11 +259,10 @@ get_chainname_rulenum(const struct ip6t_entry *s, const struct ip6t_entry *e,
} else if (s == e) {
(*rulenum)++;
- if (s->target_offset == sizeof(struct ip6t_entry) &&
+ if (unconditional(s) &&
strcmp(t->target.u.kernel.target->name,
XT_STANDARD_TARGET) == 0 &&
- t->verdict < 0 &&
- unconditional(&s->ipv6)) {
+ t->verdict < 0) {
/* Tail of chains: STANDARD target (return/policy) */
*comment = *chainname == hookname
? comments[NF_IP6_TRACE_COMMENT_POLICY]
@@ -488,11 +488,10 @@ mark_source_chains(const struct xt_table_info *newinfo,
e->comefrom |= ((1 << hook) | (1 << NF_INET_NUMHOOKS));
/* Unconditional return/END. */
- if ((e->target_offset == sizeof(struct ip6t_entry) &&
+ if ((unconditional(e) &&
(strcmp(t->target.u.user.name,
XT_STANDARD_TARGET) == 0) &&
- t->verdict < 0 &&
- unconditional(&e->ipv6)) || visited) {
+ t->verdict < 0) || visited) {
unsigned int oldpos, size;
if ((strcmp(t->target.u.user.name,
@@ -581,14 +580,12 @@ static void cleanup_match(struct xt_entry_match *m, struct net *net)
}
static int
-check_entry(const struct ip6t_entry *e, const char *name)
+check_entry(const struct ip6t_entry *e)
{
const struct xt_entry_target *t;
- if (!ip6_checkentry(&e->ipv6)) {
- duprintf("ip_tables: ip check failed %p %s.\n", e, name);
+ if (!ip6_checkentry(&e->ipv6))
return -EINVAL;
- }
if (e->target_offset + sizeof(struct xt_entry_target) >
e->next_offset)
@@ -679,10 +676,6 @@ find_check_entry(struct ip6t_entry *e, struct net *net, const char *name,
struct xt_mtchk_param mtpar;
struct xt_entry_match *ematch;
- ret = check_entry(e, name);
- if (ret)
- return ret;
-
e->counters.pcnt = xt_percpu_counter_alloc();
if (IS_ERR_VALUE(e->counters.pcnt))
return -ENOMEM;
@@ -733,7 +726,7 @@ static bool check_underflow(const struct ip6t_entry *e)
const struct xt_entry_target *t;
unsigned int verdict;
- if (!unconditional(&e->ipv6))
+ if (!unconditional(e))
return false;
t = ip6t_get_target_c(e);
if (strcmp(t->u.user.name, XT_STANDARD_TARGET) != 0)
@@ -753,9 +746,11 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
unsigned int valid_hooks)
{
unsigned int h;
+ int err;
if ((unsigned long)e % __alignof__(struct ip6t_entry) != 0 ||
- (unsigned char *)e + sizeof(struct ip6t_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct ip6t_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p\n", e);
return -EINVAL;
}
@@ -767,6 +762,10 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
return -EINVAL;
}
+ err = check_entry(e);
+ if (err)
+ return err;
+
/* Check hooks & underflows */
for (h = 0; h < NF_INET_NUMHOOKS; h++) {
if (!(valid_hooks & (1 << h)))
@@ -775,9 +774,9 @@ check_entry_size_and_hooks(struct ip6t_entry *e,
newinfo->hook_entry[h] = hook_entries[h];
if ((unsigned char *)e - base == underflows[h]) {
if (!check_underflow(e)) {
- pr_err("Underflows must be unconditional and "
- "use the STANDARD target with "
- "ACCEPT/DROP\n");
+ pr_debug("Underflows must be unconditional and "
+ "use the STANDARD target with "
+ "ACCEPT/DROP\n");
return -EINVAL;
}
newinfo->underflow[h] = underflows[h];
@@ -1169,6 +1168,7 @@ get_entries(struct net *net, struct ip6t_get_entries __user *uptr,
*len, sizeof(get) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
t = xt_find_table_lock(net, AF_INET6, get.name);
if (!IS_ERR_OR_NULL(t)) {
@@ -1505,7 +1505,8 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
duprintf("check_compat_entry_size_and_hooks %p\n", e);
if ((unsigned long)e % __alignof__(struct compat_ip6t_entry) != 0 ||
- (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit) {
+ (unsigned char *)e + sizeof(struct compat_ip6t_entry) >= limit ||
+ (unsigned char *)e + e->next_offset > limit) {
duprintf("Bad offset %p, limit = %p\n", e, limit);
return -EINVAL;
}
@@ -1518,7 +1519,7 @@ check_compat_entry_size_and_hooks(struct compat_ip6t_entry *e,
}
/* For purposes of check_entry casting the compat entry is fine */
- ret = check_entry((struct ip6t_entry *)e, name);
+ ret = check_entry((struct ip6t_entry *)e);
if (ret)
return ret;
@@ -1944,6 +1945,7 @@ compat_get_entries(struct net *net, struct compat_ip6t_get_entries __user *uptr,
*len, sizeof(get) + get.size);
return -EINVAL;
}
+ get.name[sizeof(get.name) - 1] = '\0';
xt_compat_lock(AF_INET6);
t = xt_find_table_lock(net, AF_INET6, get.name);
@@ -2071,9 +2073,28 @@ do_ip6t_get_ctl(struct sock *sk, int cmd, void __user *user, int *len)
return ret;
}
-struct xt_table *ip6t_register_table(struct net *net,
- const struct xt_table *table,
- const struct ip6t_replace *repl)
+static void __ip6t_unregister_table(struct net *net, struct xt_table *table)
+{
+ struct xt_table_info *private;
+ void *loc_cpu_entry;
+ struct module *table_owner = table->me;
+ struct ip6t_entry *iter;
+
+ private = xt_unregister_table(table);
+
+ /* Decrease module usage counts and free resources */
+ loc_cpu_entry = private->entries;
+ xt_entry_foreach(iter, loc_cpu_entry, private->size)
+ cleanup_entry(iter, net);
+ if (private->number > private->initial_entries)
+ module_put(table_owner);
+ xt_free_table_info(private);
+}
+
+int ip6t_register_table(struct net *net, const struct xt_table *table,
+ const struct ip6t_replace *repl,
+ const struct nf_hook_ops *ops,
+ struct xt_table **res)
{
int ret;
struct xt_table_info *newinfo;
@@ -2082,10 +2103,8 @@ struct xt_table *ip6t_register_table(struct net *net,
struct xt_table *new_table;
newinfo = xt_alloc_table_info(repl->size);
- if (!newinfo) {
- ret = -ENOMEM;
- goto out;
- }
+ if (!newinfo)
+ return -ENOMEM;
loc_cpu_entry = newinfo->entries;
memcpy(loc_cpu_entry, repl->entries, repl->size);
@@ -2099,30 +2118,28 @@ struct xt_table *ip6t_register_table(struct net *net,
ret = PTR_ERR(new_table);
goto out_free;
}
- return new_table;
+
+ /* set res now, will see skbs right after nf_register_net_hooks */
+ WRITE_ONCE(*res, new_table);
+
+ ret = nf_register_net_hooks(net, ops, hweight32(table->valid_hooks));
+ if (ret != 0) {
+ __ip6t_unregister_table(net, new_table);
+ *res = NULL;
+ }
+
+ return ret;
out_free:
xt_free_table_info(newinfo);
-out:
- return ERR_PTR(ret);
+ return ret;
}
-void ip6t_unregister_table(struct net *net, struct xt_table *table)
+void ip6t_unregister_table(struct net *net, struct xt_table *table,
+ const struct nf_hook_ops *ops)
{
- struct xt_table_info *private;
- void *loc_cpu_entry;
- struct module *table_owner = table->me;
- struct ip6t_entry *iter;
-
- private = xt_unregister_table(table);
-
- /* Decrease module usage counts and free resources */
- loc_cpu_entry = private->entries;
- xt_entry_foreach(iter, loc_cpu_entry, private->size)
- cleanup_entry(iter, net);
- if (private->number > private->initial_entries)
- module_put(table_owner);
- xt_free_table_info(private);
+ nf_unregister_net_hooks(net, ops, hweight32(table->valid_hooks));
+ __ip6t_unregister_table(net, table);
}
/* Returns 1 if the type and code is matched by the range, 0 otherwise */
diff --git a/net/ipv6/netfilter/ip6table_filter.c b/net/ipv6/netfilter/ip6table_filter.c
index 8b277b983ca5..1343077dde93 100644
--- a/net/ipv6/netfilter/ip6table_filter.c
+++ b/net/ipv6/netfilter/ip6table_filter.c
@@ -22,12 +22,15 @@ MODULE_DESCRIPTION("ip6tables filter table");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT))
+static int __net_init ip6table_filter_table_init(struct net *net);
+
static const struct xt_table packet_filter = {
.name = "filter",
.valid_hooks = FILTER_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_FILTER,
+ .table_init = ip6table_filter_table_init,
};
/* The work comes in here from netfilter.c. */
@@ -44,9 +47,13 @@ static struct nf_hook_ops *filter_ops __read_mostly;
static bool forward = true;
module_param(forward, bool, 0000);
-static int __net_init ip6table_filter_net_init(struct net *net)
+static int __net_init ip6table_filter_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int err;
+
+ if (net->ipv6.ip6table_filter)
+ return 0;
repl = ip6t_alloc_initial_table(&packet_filter);
if (repl == NULL)
@@ -55,15 +62,26 @@ static int __net_init ip6table_filter_net_init(struct net *net)
((struct ip6t_standard *)repl->entries)[1].target.verdict =
forward ? -NF_ACCEPT - 1 : -NF_DROP - 1;
- net->ipv6.ip6table_filter =
- ip6t_register_table(net, &packet_filter, repl);
+ err = ip6t_register_table(net, &packet_filter, repl, filter_ops,
+ &net->ipv6.ip6table_filter);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_filter);
+ return err;
+}
+
+static int __net_init ip6table_filter_net_init(struct net *net)
+{
+ if (net == &init_net || !forward)
+ return ip6table_filter_table_init(net);
+
+ return 0;
}
static void __net_exit ip6table_filter_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_filter);
+ if (!net->ipv6.ip6table_filter)
+ return;
+ ip6t_unregister_table(net, net->ipv6.ip6table_filter, filter_ops);
+ net->ipv6.ip6table_filter = NULL;
}
static struct pernet_operations ip6table_filter_net_ops = {
@@ -75,28 +93,21 @@ static int __init ip6table_filter_init(void)
{
int ret;
+ filter_ops = xt_hook_ops_alloc(&packet_filter, ip6table_filter_hook);
+ if (IS_ERR(filter_ops))
+ return PTR_ERR(filter_ops);
+
ret = register_pernet_subsys(&ip6table_filter_net_ops);
if (ret < 0)
- return ret;
-
- /* Register hooks */
- filter_ops = xt_hook_link(&packet_filter, ip6table_filter_hook);
- if (IS_ERR(filter_ops)) {
- ret = PTR_ERR(filter_ops);
- goto cleanup_table;
- }
+ kfree(filter_ops);
return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&ip6table_filter_net_ops);
- return ret;
}
static void __exit ip6table_filter_fini(void)
{
- xt_hook_unlink(&packet_filter, filter_ops);
unregister_pernet_subsys(&ip6table_filter_net_ops);
+ kfree(filter_ops);
}
module_init(ip6table_filter_init);
diff --git a/net/ipv6/netfilter/ip6table_mangle.c b/net/ipv6/netfilter/ip6table_mangle.c
index abe278b07932..cb2b28883252 100644
--- a/net/ipv6/netfilter/ip6table_mangle.c
+++ b/net/ipv6/netfilter/ip6table_mangle.c
@@ -23,12 +23,15 @@ MODULE_DESCRIPTION("ip6tables mangle table");
(1 << NF_INET_LOCAL_OUT) | \
(1 << NF_INET_POST_ROUTING))
+static int __net_init ip6table_mangle_table_init(struct net *net);
+
static const struct xt_table packet_mangler = {
.name = "mangle",
.valid_hooks = MANGLE_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_MANGLE,
+ .table_init = ip6table_mangle_table_init,
};
static unsigned int
@@ -88,26 +91,33 @@ ip6table_mangle_hook(void *priv, struct sk_buff *skb,
}
static struct nf_hook_ops *mangle_ops __read_mostly;
-static int __net_init ip6table_mangle_net_init(struct net *net)
+static int __net_init ip6table_mangle_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int ret;
+
+ if (net->ipv6.ip6table_mangle)
+ return 0;
repl = ip6t_alloc_initial_table(&packet_mangler);
if (repl == NULL)
return -ENOMEM;
- net->ipv6.ip6table_mangle =
- ip6t_register_table(net, &packet_mangler, repl);
+ ret = ip6t_register_table(net, &packet_mangler, repl, mangle_ops,
+ &net->ipv6.ip6table_mangle);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_mangle);
+ return ret;
}
static void __net_exit ip6table_mangle_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_mangle);
+ if (!net->ipv6.ip6table_mangle)
+ return;
+
+ ip6t_unregister_table(net, net->ipv6.ip6table_mangle, mangle_ops);
+ net->ipv6.ip6table_mangle = NULL;
}
static struct pernet_operations ip6table_mangle_net_ops = {
- .init = ip6table_mangle_net_init,
.exit = ip6table_mangle_net_exit,
};
@@ -115,28 +125,28 @@ static int __init ip6table_mangle_init(void)
{
int ret;
+ mangle_ops = xt_hook_ops_alloc(&packet_mangler, ip6table_mangle_hook);
+ if (IS_ERR(mangle_ops))
+ return PTR_ERR(mangle_ops);
+
ret = register_pernet_subsys(&ip6table_mangle_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(mangle_ops);
return ret;
-
- /* Register hooks */
- mangle_ops = xt_hook_link(&packet_mangler, ip6table_mangle_hook);
- if (IS_ERR(mangle_ops)) {
- ret = PTR_ERR(mangle_ops);
- goto cleanup_table;
}
- return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ ret = ip6table_mangle_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ kfree(mangle_ops);
+ }
return ret;
}
static void __exit ip6table_mangle_fini(void)
{
- xt_hook_unlink(&packet_mangler, mangle_ops);
unregister_pernet_subsys(&ip6table_mangle_net_ops);
+ kfree(mangle_ops);
}
module_init(ip6table_mangle_init);
diff --git a/net/ipv6/netfilter/ip6table_nat.c b/net/ipv6/netfilter/ip6table_nat.c
index de2a10a565f5..7d2bd940291f 100644
--- a/net/ipv6/netfilter/ip6table_nat.c
+++ b/net/ipv6/netfilter/ip6table_nat.c
@@ -20,6 +20,8 @@
#include <net/netfilter/nf_nat_core.h>
#include <net/netfilter/nf_nat_l3proto.h>
+static int __net_init ip6table_nat_table_init(struct net *net);
+
static const struct xt_table nf_nat_ipv6_table = {
.name = "nat",
.valid_hooks = (1 << NF_INET_PRE_ROUTING) |
@@ -28,6 +30,7 @@ static const struct xt_table nf_nat_ipv6_table = {
(1 << NF_INET_LOCAL_IN),
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
+ .table_init = ip6table_nat_table_init,
};
static unsigned int ip6table_nat_do_chain(void *priv,
@@ -97,50 +100,50 @@ static struct nf_hook_ops nf_nat_ipv6_ops[] __read_mostly = {
},
};
-static int __net_init ip6table_nat_net_init(struct net *net)
+static int __net_init ip6table_nat_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int ret;
+
+ if (net->ipv6.ip6table_nat)
+ return 0;
repl = ip6t_alloc_initial_table(&nf_nat_ipv6_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv6.ip6table_nat = ip6t_register_table(net, &nf_nat_ipv6_table, repl);
+ ret = ip6t_register_table(net, &nf_nat_ipv6_table, repl,
+ nf_nat_ipv6_ops, &net->ipv6.ip6table_nat);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_nat);
+ return ret;
}
static void __net_exit ip6table_nat_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_nat);
+ if (!net->ipv6.ip6table_nat)
+ return;
+ ip6t_unregister_table(net, net->ipv6.ip6table_nat, nf_nat_ipv6_ops);
+ net->ipv6.ip6table_nat = NULL;
}
static struct pernet_operations ip6table_nat_net_ops = {
- .init = ip6table_nat_net_init,
.exit = ip6table_nat_net_exit,
};
static int __init ip6table_nat_init(void)
{
- int err;
+ int ret = register_pernet_subsys(&ip6table_nat_net_ops);
- err = register_pernet_subsys(&ip6table_nat_net_ops);
- if (err < 0)
- goto err1;
+ if (ret)
+ return ret;
- err = nf_register_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
- if (err < 0)
- goto err2;
- return 0;
-
-err2:
- unregister_pernet_subsys(&ip6table_nat_net_ops);
-err1:
- return err;
+ ret = ip6table_nat_table_init(&init_net);
+ if (ret)
+ unregister_pernet_subsys(&ip6table_nat_net_ops);
+ return ret;
}
static void __exit ip6table_nat_exit(void)
{
- nf_unregister_hooks(nf_nat_ipv6_ops, ARRAY_SIZE(nf_nat_ipv6_ops));
unregister_pernet_subsys(&ip6table_nat_net_ops);
}
diff --git a/net/ipv6/netfilter/ip6table_raw.c b/net/ipv6/netfilter/ip6table_raw.c
index 9021963565c3..d4bc56443dc1 100644
--- a/net/ipv6/netfilter/ip6table_raw.c
+++ b/net/ipv6/netfilter/ip6table_raw.c
@@ -9,12 +9,15 @@
#define RAW_VALID_HOOKS ((1 << NF_INET_PRE_ROUTING) | (1 << NF_INET_LOCAL_OUT))
+static int __net_init ip6table_raw_table_init(struct net *net);
+
static const struct xt_table packet_raw = {
.name = "raw",
.valid_hooks = RAW_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_RAW,
+ .table_init = ip6table_raw_table_init,
};
/* The work comes in here from netfilter.c. */
@@ -27,26 +30,32 @@ ip6table_raw_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *rawtable_ops __read_mostly;
-static int __net_init ip6table_raw_net_init(struct net *net)
+static int __net_init ip6table_raw_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int ret;
+
+ if (net->ipv6.ip6table_raw)
+ return 0;
repl = ip6t_alloc_initial_table(&packet_raw);
if (repl == NULL)
return -ENOMEM;
- net->ipv6.ip6table_raw =
- ip6t_register_table(net, &packet_raw, repl);
+ ret = ip6t_register_table(net, &packet_raw, repl, rawtable_ops,
+ &net->ipv6.ip6table_raw);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_raw);
+ return ret;
}
static void __net_exit ip6table_raw_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_raw);
+ if (!net->ipv6.ip6table_raw)
+ return;
+ ip6t_unregister_table(net, net->ipv6.ip6table_raw, rawtable_ops);
+ net->ipv6.ip6table_raw = NULL;
}
static struct pernet_operations ip6table_raw_net_ops = {
- .init = ip6table_raw_net_init,
.exit = ip6table_raw_net_exit,
};
@@ -54,28 +63,29 @@ static int __init ip6table_raw_init(void)
{
int ret;
+ /* Register hooks */
+ rawtable_ops = xt_hook_ops_alloc(&packet_raw, ip6table_raw_hook);
+ if (IS_ERR(rawtable_ops))
+ return PTR_ERR(rawtable_ops);
+
ret = register_pernet_subsys(&ip6table_raw_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(rawtable_ops);
return ret;
-
- /* Register hooks */
- rawtable_ops = xt_hook_link(&packet_raw, ip6table_raw_hook);
- if (IS_ERR(rawtable_ops)) {
- ret = PTR_ERR(rawtable_ops);
- goto cleanup_table;
}
- return ret;
-
- cleanup_table:
- unregister_pernet_subsys(&ip6table_raw_net_ops);
+ ret = ip6table_raw_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&ip6table_raw_net_ops);
+ kfree(rawtable_ops);
+ }
return ret;
}
static void __exit ip6table_raw_fini(void)
{
- xt_hook_unlink(&packet_raw, rawtable_ops);
unregister_pernet_subsys(&ip6table_raw_net_ops);
+ kfree(rawtable_ops);
}
module_init(ip6table_raw_init);
diff --git a/net/ipv6/netfilter/ip6table_security.c b/net/ipv6/netfilter/ip6table_security.c
index 0d856fedfeb0..cf26ccb04056 100644
--- a/net/ipv6/netfilter/ip6table_security.c
+++ b/net/ipv6/netfilter/ip6table_security.c
@@ -27,12 +27,15 @@ MODULE_DESCRIPTION("ip6tables security table, for MAC rules");
(1 << NF_INET_FORWARD) | \
(1 << NF_INET_LOCAL_OUT)
+static int __net_init ip6table_security_table_init(struct net *net);
+
static const struct xt_table security_table = {
.name = "security",
.valid_hooks = SECURITY_VALID_HOOKS,
.me = THIS_MODULE,
.af = NFPROTO_IPV6,
.priority = NF_IP6_PRI_SECURITY,
+ .table_init = ip6table_security_table_init,
};
static unsigned int
@@ -44,26 +47,32 @@ ip6table_security_hook(void *priv, struct sk_buff *skb,
static struct nf_hook_ops *sectbl_ops __read_mostly;
-static int __net_init ip6table_security_net_init(struct net *net)
+static int __net_init ip6table_security_table_init(struct net *net)
{
struct ip6t_replace *repl;
+ int ret;
+
+ if (net->ipv6.ip6table_security)
+ return 0;
repl = ip6t_alloc_initial_table(&security_table);
if (repl == NULL)
return -ENOMEM;
- net->ipv6.ip6table_security =
- ip6t_register_table(net, &security_table, repl);
+ ret = ip6t_register_table(net, &security_table, repl, sectbl_ops,
+ &net->ipv6.ip6table_security);
kfree(repl);
- return PTR_ERR_OR_ZERO(net->ipv6.ip6table_security);
+ return ret;
}
static void __net_exit ip6table_security_net_exit(struct net *net)
{
- ip6t_unregister_table(net, net->ipv6.ip6table_security);
+ if (!net->ipv6.ip6table_security)
+ return;
+ ip6t_unregister_table(net, net->ipv6.ip6table_security, sectbl_ops);
+ net->ipv6.ip6table_security = NULL;
}
static struct pernet_operations ip6table_security_net_ops = {
- .init = ip6table_security_net_init,
.exit = ip6table_security_net_exit,
};
@@ -71,27 +80,28 @@ static int __init ip6table_security_init(void)
{
int ret;
+ sectbl_ops = xt_hook_ops_alloc(&security_table, ip6table_security_hook);
+ if (IS_ERR(sectbl_ops))
+ return PTR_ERR(sectbl_ops);
+
ret = register_pernet_subsys(&ip6table_security_net_ops);
- if (ret < 0)
+ if (ret < 0) {
+ kfree(sectbl_ops);
return ret;
-
- sectbl_ops = xt_hook_link(&security_table, ip6table_security_hook);
- if (IS_ERR(sectbl_ops)) {
- ret = PTR_ERR(sectbl_ops);
- goto cleanup_table;
}
- return ret;
-
-cleanup_table:
- unregister_pernet_subsys(&ip6table_security_net_ops);
+ ret = ip6table_security_table_init(&init_net);
+ if (ret) {
+ unregister_pernet_subsys(&ip6table_security_net_ops);
+ kfree(sectbl_ops);
+ }
return ret;
}
static void __exit ip6table_security_fini(void)
{
- xt_hook_unlink(&security_table, sectbl_ops);
unregister_pernet_subsys(&ip6table_security_net_ops);
+ kfree(sectbl_ops);
}
module_init(ip6table_security_init);
diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c
index d5efeb87350e..e4347aeb2e65 100644
--- a/net/ipv6/netfilter/nf_conntrack_reasm.c
+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c
@@ -56,7 +56,6 @@ struct nf_ct_frag6_skb_cb
{
struct inet6_skb_parm h;
int offset;
- struct sk_buff *orig;
};
#define NFCT_FRAG6_CB(skb) ((struct nf_ct_frag6_skb_cb *)((skb)->cb))
@@ -170,12 +169,6 @@ static unsigned int nf_hashfn(const struct inet_frag_queue *q)
return nf_hash_frag(nq->id, &nq->saddr, &nq->daddr);
}
-static void nf_skb_free(struct sk_buff *skb)
-{
- if (NFCT_FRAG6_CB(skb)->orig)
- kfree_skb(NFCT_FRAG6_CB(skb)->orig);
-}
-
static void nf_ct_frag6_expire(unsigned long data)
{
struct frag_queue *fq;
@@ -190,7 +183,7 @@ static void nf_ct_frag6_expire(unsigned long data)
/* Creation primitives. */
static inline struct frag_queue *fq_find(struct net *net, __be32 id,
u32 user, struct in6_addr *src,
- struct in6_addr *dst, u8 ecn)
+ struct in6_addr *dst, int iif, u8 ecn)
{
struct inet_frag_queue *q;
struct ip6_create_arg arg;
@@ -200,6 +193,7 @@ static inline struct frag_queue *fq_find(struct net *net, __be32 id,
arg.user = user;
arg.src = src;
arg.dst = dst;
+ arg.iif = iif;
arg.ecn = ecn;
local_bh_disable();
@@ -368,17 +362,18 @@ err:
/*
* Check if this packet is complete.
- * Returns NULL on failure by any reason, and pointer
- * to current nexthdr field in reassembled frame.
*
* It is called with locked fq, and caller must check that
* queue is eligible for reassembly i.e. it is not COMPLETE,
* the last and the first frames arrived and all the bits are here.
+ *
+ * returns true if *prev skb has been transformed into the reassembled
+ * skb, false otherwise.
*/
-static struct sk_buff *
-nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
+static bool
+nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_device *dev)
{
- struct sk_buff *fp, *op, *head = fq->q.fragments;
+ struct sk_buff *fp, *head = fq->q.fragments;
int payload_len;
u8 ecn;
@@ -389,22 +384,21 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
ecn = ip_frag_ecn_table[fq->ecn];
if (unlikely(ecn == 0xff))
- goto out_fail;
+ return false;
/* Unfragmented part is taken from the first segment. */
payload_len = ((head->data - skb_network_header(head)) -
sizeof(struct ipv6hdr) + fq->q.len -
sizeof(struct frag_hdr));
if (payload_len > IPV6_MAXPLEN) {
- pr_debug("payload len is too large.\n");
- goto out_oversize;
+ net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n",
+ payload_len);
+ return false;
}
/* Head of list must not be cloned. */
- if (skb_unclone(head, GFP_ATOMIC)) {
- pr_debug("skb is cloned but can't expand head");
- goto out_oom;
- }
+ if (skb_unclone(head, GFP_ATOMIC))
+ return false;
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
@@ -415,7 +409,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
clone = alloc_skb(0, GFP_ATOMIC);
if (clone == NULL)
- goto out_oom;
+ return false;
clone->next = head->next;
head->next = clone;
@@ -429,10 +423,41 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
clone->csum = 0;
clone->ip_summed = head->ip_summed;
- NFCT_FRAG6_CB(clone)->orig = NULL;
add_frag_mem_limit(fq->q.net, clone->truesize);
}
+ /* morph head into last received skb: prev.
+ *
+ * This allows callers of ipv6 conntrack defrag to continue
+ * to use the last skb(frag) passed into the reasm engine.
+ * The last skb frag 'silently' turns into the full reassembled skb.
+ *
+ * Since prev is also part of q->fragments we have to clone it first.
+ */
+ if (head != prev) {
+ struct sk_buff *iter;
+
+ fp = skb_clone(prev, GFP_ATOMIC);
+ if (!fp)
+ return false;
+
+ fp->next = prev->next;
+
+ iter = head;
+ while (iter) {
+ if (iter->next == prev) {
+ iter->next = fp;
+ break;
+ }
+ iter = iter->next;
+ }
+
+ skb_morph(prev, head);
+ prev->next = head->next;
+ consume_skb(head);
+ head = prev;
+ }
+
/* We have to remove fragment header from datagram and to relocate
* header in order to calculate ICV correctly. */
skb_network_header(head)[fq->nhoffset] = skb_transport_header(head)[0];
@@ -473,31 +498,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct net_device *dev)
fq->q.fragments = NULL;
fq->q.fragments_tail = NULL;
- /* all original skbs are linked into the NFCT_FRAG6_CB(head).orig */
- fp = skb_shinfo(head)->frag_list;
- if (fp && NFCT_FRAG6_CB(fp)->orig == NULL)
- /* at above code, head skb is divided into two skbs. */
- fp = fp->next;
-
- op = NFCT_FRAG6_CB(head)->orig;
- for (; fp; fp = fp->next) {
- struct sk_buff *orig = NFCT_FRAG6_CB(fp)->orig;
-
- op->next = orig;
- op = orig;
- NFCT_FRAG6_CB(fp)->orig = NULL;
- }
-
- return head;
-
-out_oversize:
- net_dbg_ratelimited("nf_ct_frag6_reasm: payload len = %d\n",
- payload_len);
- goto out_fail;
-out_oom:
- net_dbg_ratelimited("nf_ct_frag6_reasm: no memory for reassembly\n");
-out_fail:
- return NULL;
+ return true;
}
/*
@@ -563,89 +564,61 @@ find_prev_fhdr(struct sk_buff *skb, u8 *prevhdrp, int *prevhoff, int *fhoff)
return 0;
}
-struct sk_buff *nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
+int nf_ct_frag6_gather(struct net *net, struct sk_buff *skb, u32 user)
{
- struct sk_buff *clone;
struct net_device *dev = skb->dev;
+ int fhoff, nhoff, ret;
struct frag_hdr *fhdr;
struct frag_queue *fq;
struct ipv6hdr *hdr;
- int fhoff, nhoff;
u8 prevhdr;
- struct sk_buff *ret_skb = NULL;
/* Jumbo payload inhibits frag. header */
if (ipv6_hdr(skb)->payload_len == 0) {
pr_debug("payload len = 0\n");
- return skb;
+ return -EINVAL;
}
if (find_prev_fhdr(skb, &prevhdr, &nhoff, &fhoff) < 0)
- return skb;
+ return -EINVAL;
- clone = skb_clone(skb, GFP_ATOMIC);
- if (clone == NULL) {
- pr_debug("Can't clone skb\n");
- return skb;
- }
+ if (!pskb_may_pull(skb, fhoff + sizeof(*fhdr)))
+ return -ENOMEM;
- NFCT_FRAG6_CB(clone)->orig = skb;
-
- if (!pskb_may_pull(clone, fhoff + sizeof(*fhdr))) {
- pr_debug("message is too short.\n");
- goto ret_orig;
- }
-
- skb_set_transport_header(clone, fhoff);
- hdr = ipv6_hdr(clone);
- fhdr = (struct frag_hdr *)skb_transport_header(clone);
+ skb_set_transport_header(skb, fhoff);
+ hdr = ipv6_hdr(skb);
+ fhdr = (struct frag_hdr *)skb_transport_header(skb);
fq = fq_find(net, fhdr->identification, user, &hdr->saddr, &hdr->daddr,
- ip6_frag_ecn(hdr));
+ skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
if (fq == NULL) {
pr_debug("Can't find and can't create new queue\n");
- goto ret_orig;
+ return -ENOMEM;
}
spin_lock_bh(&fq->q.lock);
- if (nf_ct_frag6_queue(fq, clone, fhdr, nhoff) < 0) {
- spin_unlock_bh(&fq->q.lock);
- pr_debug("Can't insert skb to queue\n");
- inet_frag_put(&fq->q, &nf_frags);
- goto ret_orig;
+ if (nf_ct_frag6_queue(fq, skb, fhdr, nhoff) < 0) {
+ ret = -EINVAL;
+ goto out_unlock;
}
+ /* after queue has assumed skb ownership, only 0 or -EINPROGRESS
+ * must be returned.
+ */
+ ret = -EINPROGRESS;
if (fq->q.flags == (INET_FRAG_FIRST_IN | INET_FRAG_LAST_IN) &&
- fq->q.meat == fq->q.len) {
- ret_skb = nf_ct_frag6_reasm(fq, dev);
- if (ret_skb == NULL)
- pr_debug("Can't reassemble fragmented packets\n");
- }
- spin_unlock_bh(&fq->q.lock);
+ fq->q.meat == fq->q.len &&
+ nf_ct_frag6_reasm(fq, skb, dev))
+ ret = 0;
+out_unlock:
+ spin_unlock_bh(&fq->q.lock);
inet_frag_put(&fq->q, &nf_frags);
- return ret_skb;
-
-ret_orig:
- kfree_skb(clone);
- return skb;
+ return ret;
}
EXPORT_SYMBOL_GPL(nf_ct_frag6_gather);
-void nf_ct_frag6_consume_orig(struct sk_buff *skb)
-{
- struct sk_buff *s, *s2;
-
- for (s = NFCT_FRAG6_CB(skb)->orig; s;) {
- s2 = s->next;
- s->next = NULL;
- consume_skb(s);
- s = s2;
- }
-}
-EXPORT_SYMBOL_GPL(nf_ct_frag6_consume_orig);
-
static int nf_ct_net_init(struct net *net)
{
int res;
@@ -680,7 +653,6 @@ int nf_ct_frag6_init(void)
nf_frags.hashfn = nf_hashfn;
nf_frags.constructor = ip6_frag_init;
nf_frags.destructor = NULL;
- nf_frags.skb_free = nf_skb_free;
nf_frags.qsize = sizeof(struct frag_queue);
nf_frags.match = ip6_frag_match;
nf_frags.frag_expire = nf_ct_frag6_expire;
diff --git a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
index 4fdbed5ebfb6..f7aab5ab93a5 100644
--- a/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
+++ b/net/ipv6/netfilter/nf_defrag_ipv6_hooks.c
@@ -55,7 +55,7 @@ static unsigned int ipv6_defrag(void *priv,
struct sk_buff *skb,
const struct nf_hook_state *state)
{
- struct sk_buff *reasm;
+ int err;
#if IS_ENABLED(CONFIG_NF_CONNTRACK)
/* Previously seen (loopback)? */
@@ -63,23 +63,13 @@ static unsigned int ipv6_defrag(void *priv,
return NF_ACCEPT;
#endif
- reasm = nf_ct_frag6_gather(state->net, skb,
- nf_ct6_defrag_user(state->hook, skb));
+ err = nf_ct_frag6_gather(state->net, skb,
+ nf_ct6_defrag_user(state->hook, skb));
/* queued */
- if (reasm == NULL)
+ if (err == -EINPROGRESS)
return NF_STOLEN;
- /* error occurred or not fragmented */
- if (reasm == skb)
- return NF_ACCEPT;
-
- nf_ct_frag6_consume_orig(reasm);
-
- NF_HOOK_THRESH(NFPROTO_IPV6, state->hook, state->net, state->sk, reasm,
- state->in, state->out,
- state->okfn, NF_IP6_PRI_CONNTRACK_DEFRAG + 1);
-
- return NF_STOLEN;
+ return NF_ACCEPT;
}
static struct nf_hook_ops ipv6_defrag_ops[] = {
diff --git a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
index 238e70c3f7b7..e0be97e636a4 100644
--- a/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_l3proto_ipv6.c
@@ -131,28 +131,15 @@ static void nf_nat_ipv6_csum_recalc(struct sk_buff *skb,
u8 proto, void *data, __sum16 *check,
int datalen, int oldlen)
{
- const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
- struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
-
if (skb->ip_summed != CHECKSUM_PARTIAL) {
- if (!(rt->rt6i_flags & RTF_LOCAL) &&
- (!skb->dev || skb->dev->features & NETIF_F_V6_CSUM)) {
- skb->ip_summed = CHECKSUM_PARTIAL;
- skb->csum_start = skb_headroom(skb) +
- skb_network_offset(skb) +
- (data - (void *)skb->data);
- skb->csum_offset = (void *)check - data;
- *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
- datalen, proto, 0);
- } else {
- *check = 0;
- *check = csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
- datalen, proto,
- csum_partial(data, datalen,
- 0));
- if (proto == IPPROTO_UDP && !*check)
- *check = CSUM_MANGLED_0;
- }
+ const struct ipv6hdr *ipv6h = ipv6_hdr(skb);
+
+ skb->ip_summed = CHECKSUM_PARTIAL;
+ skb->csum_start = skb_headroom(skb) + skb_network_offset(skb) +
+ (data - (void *)skb->data);
+ skb->csum_offset = (void *)check - data;
+ *check = ~csum_ipv6_magic(&ipv6h->saddr, &ipv6h->daddr,
+ datalen, proto, 0);
} else
inet_proto_csum_replace2(check, skb,
htons(oldlen), htons(datalen), true);
diff --git a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
index 31ba7ca19757..051b6a6bfff6 100644
--- a/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
+++ b/net/ipv6/netfilter/nf_nat_masquerade_ipv6.c
@@ -21,6 +21,10 @@
#include <net/ipv6.h>
#include <net/netfilter/ipv6/nf_nat_masquerade.h>
+#define MAX_WORK_COUNT 16
+
+static atomic_t v6_worker_count;
+
unsigned int
nf_nat_masquerade_ipv6(struct sk_buff *skb, const struct nf_nat_range *range,
const struct net_device *out)
@@ -78,14 +82,78 @@ static struct notifier_block masq_dev_notifier = {
.notifier_call = masq_device_event,
};
+struct masq_dev_work {
+ struct work_struct work;
+ struct net *net;
+ int ifindex;
+};
+
+static void iterate_cleanup_work(struct work_struct *work)
+{
+ struct masq_dev_work *w;
+ long index;
+
+ w = container_of(work, struct masq_dev_work, work);
+
+ index = w->ifindex;
+ nf_ct_iterate_cleanup(w->net, device_cmp, (void *)index, 0, 0);
+
+ put_net(w->net);
+ kfree(w);
+ atomic_dec(&v6_worker_count);
+ module_put(THIS_MODULE);
+}
+
+/* ipv6 inet notifier is an atomic notifier, i.e. we cannot
+ * schedule.
+ *
+ * Unfortunately, nf_ct_iterate_cleanup can run for a long
+ * time if there are lots of conntracks and the system
+ * handles high softirq load, so it frequently calls cond_resched
+ * while iterating the conntrack table.
+ *
+ * So we defer nf_ct_iterate_cleanup walk to the system workqueue.
+ *
+ * As we can have 'a lot' of inet_events (depending on amount
+ * of ipv6 addresses being deleted), we also need to add an upper
+ * limit to the number of queued work items.
+ */
static int masq_inet_event(struct notifier_block *this,
unsigned long event, void *ptr)
{
struct inet6_ifaddr *ifa = ptr;
- struct netdev_notifier_info info;
+ const struct net_device *dev;
+ struct masq_dev_work *w;
+ struct net *net;
+
+ if (event != NETDEV_DOWN ||
+ atomic_read(&v6_worker_count) >= MAX_WORK_COUNT)
+ return NOTIFY_DONE;
+
+ dev = ifa->idev->dev;
+ net = maybe_get_net(dev_net(dev));
+ if (!net)
+ return NOTIFY_DONE;
- netdev_notifier_info_init(&info, ifa->idev->dev);
- return masq_device_event(this, event, &info);
+ if (!try_module_get(THIS_MODULE))
+ goto err_module;
+
+ w = kmalloc(sizeof(*w), GFP_ATOMIC);
+ if (w) {
+ atomic_inc(&v6_worker_count);
+
+ INIT_WORK(&w->work, iterate_cleanup_work);
+ w->ifindex = dev->ifindex;
+ w->net = net;
+ schedule_work(&w->work);
+
+ return NOTIFY_DONE;
+ }
+
+ module_put(THIS_MODULE);
+ err_module:
+ put_net(net);
+ return NOTIFY_DONE;
}
static struct notifier_block masq_inet_notifier = {
diff --git a/net/ipv6/netfilter/nf_reject_ipv6.c b/net/ipv6/netfilter/nf_reject_ipv6.c
index e0f922b777e3..4709f657b7b6 100644
--- a/net/ipv6/netfilter/nf_reject_ipv6.c
+++ b/net/ipv6/netfilter/nf_reject_ipv6.c
@@ -14,7 +14,6 @@
#include <net/netfilter/ipv6/nf_reject.h>
#include <linux/netfilter_ipv6.h>
#include <linux/netfilter_bridge.h>
-#include <net/netfilter/ipv6/nf_reject.h>
const struct tcphdr *nf_reject_ip6_tcphdr_get(struct sk_buff *oldskb,
struct tcphdr *otcph,
diff --git a/net/ipv6/netfilter/nf_tables_ipv6.c b/net/ipv6/netfilter/nf_tables_ipv6.c
index 120ea9131be0..30b22f4dff55 100644
--- a/net/ipv6/netfilter/nf_tables_ipv6.c
+++ b/net/ipv6/netfilter/nf_tables_ipv6.c
@@ -77,7 +77,7 @@ err:
static void nf_tables_ipv6_exit_net(struct net *net)
{
- nft_unregister_afinfo(net->nft.ipv6);
+ nft_unregister_afinfo(net, net->nft.ipv6);
kfree(net->nft.ipv6);
}
diff --git a/net/ipv6/netfilter/nft_masq_ipv6.c b/net/ipv6/netfilter/nft_masq_ipv6.c
index cd1ac1637a05..9597ffb74077 100644
--- a/net/ipv6/netfilter/nft_masq_ipv6.c
+++ b/net/ipv6/netfilter/nft_masq_ipv6.c
@@ -26,7 +26,12 @@ static void nft_masq_ipv6_eval(const struct nft_expr *expr,
memset(&range, 0, sizeof(range));
range.flags = priv->flags;
-
+ if (priv->sreg_proto_min) {
+ range.min_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_min];
+ range.max_proto.all =
+ *(__be16 *)&regs->data[priv->sreg_proto_max];
+ }
regs->verdict.code = nf_nat_masquerade_ipv6(pkt->skb, &range, pkt->out);
}
diff --git a/net/ipv6/ping.c b/net/ipv6/ping.c
index 263a5164a6f5..c382db7a2e73 100644
--- a/net/ipv6/ping.c
+++ b/net/ipv6/ping.c
@@ -26,35 +26,6 @@
#include <net/transp_v6.h>
#include <net/ping.h>
-struct proto pingv6_prot = {
- .name = "PINGv6",
- .owner = THIS_MODULE,
- .init = ping_init_sock,
- .close = ping_close,
- .connect = ip6_datagram_connect_v6_only,
- .disconnect = udp_disconnect,
- .setsockopt = ipv6_setsockopt,
- .getsockopt = ipv6_getsockopt,
- .sendmsg = ping_v6_sendmsg,
- .recvmsg = ping_recvmsg,
- .bind = ping_bind,
- .backlog_rcv = ping_queue_rcv_skb,
- .hash = ping_hash,
- .unhash = ping_unhash,
- .get_port = ping_get_port,
- .obj_size = sizeof(struct raw6_sock),
-};
-EXPORT_SYMBOL_GPL(pingv6_prot);
-
-static struct inet_protosw pingv6_protosw = {
- .type = SOCK_DGRAM,
- .protocol = IPPROTO_ICMPV6,
- .prot = &pingv6_prot,
- .ops = &inet6_dgram_ops,
- .flags = INET_PROTOSW_REUSE,
-};
-
-
/* Compatibility glue so we can support IPv6 when it's compiled as a module */
static int dummy_ipv6_recv_error(struct sock *sk, struct msghdr *msg, int len,
int *addr_len)
@@ -77,7 +48,7 @@ static int dummy_ipv6_chk_addr(struct net *net, const struct in6_addr *addr,
return 0;
}
-int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
+static int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
struct inet_sock *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk);
@@ -192,6 +163,34 @@ int ping_v6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
return len;
}
+struct proto pingv6_prot = {
+ .name = "PINGv6",
+ .owner = THIS_MODULE,
+ .init = ping_init_sock,
+ .close = ping_close,
+ .connect = ip6_datagram_connect_v6_only,
+ .disconnect = udp_disconnect,
+ .setsockopt = ipv6_setsockopt,
+ .getsockopt = ipv6_getsockopt,
+ .sendmsg = ping_v6_sendmsg,
+ .recvmsg = ping_recvmsg,
+ .bind = ping_bind,
+ .backlog_rcv = ping_queue_rcv_skb,
+ .hash = ping_hash,
+ .unhash = ping_unhash,
+ .get_port = ping_get_port,
+ .obj_size = sizeof(struct raw6_sock),
+};
+EXPORT_SYMBOL_GPL(pingv6_prot);
+
+static struct inet_protosw pingv6_protosw = {
+ .type = SOCK_DGRAM,
+ .protocol = IPPROTO_ICMPV6,
+ .prot = &pingv6_prot,
+ .ops = &inet6_dgram_ops,
+ .flags = INET_PROTOSW_REUSE,
+};
+
#ifdef CONFIG_PROC_FS
static void *ping_v6_seq_start(struct seq_file *seq, loff_t *pos)
{
diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c
index dc65ec198f7c..fa59dd7a427e 100644
--- a/net/ipv6/raw.c
+++ b/net/ipv6/raw.c
@@ -733,6 +733,7 @@ static int raw6_getfrag(void *from, char *to, int offset, int len, int odd,
static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
{
+ struct ipv6_txoptions *opt_to_free = NULL;
struct ipv6_txoptions opt_space;
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
struct in6_addr *daddr, *final_p, final;
@@ -839,8 +840,10 @@ static int rawv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
if (!(opt->opt_nflen|opt->opt_flen))
opt = NULL;
}
- if (!opt)
- opt = np->opt;
+ if (!opt) {
+ opt = txopt_get(np);
+ opt_to_free = opt;
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
@@ -906,6 +909,7 @@ done:
dst_release(dst);
out:
fl6_sock_release(flowlabel);
+ txopt_put(opt_to_free);
return err < 0 ? err : len;
do_confirm:
dst_confirm(dst);
@@ -968,6 +972,11 @@ static int do_rawv6_setsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
switch (optname) {
+ case IPV6_HDRINCL:
+ if (sk->sk_type != SOCK_RAW)
+ return -EINVAL;
+ inet_sk(sk)->hdrincl = !!val;
+ return 0;
case IPV6_CHECKSUM:
if (inet_sk(sk)->inet_num == IPPROTO_ICMPV6 &&
level == IPPROTO_IPV6) {
@@ -1012,7 +1021,8 @@ static int rawv6_setsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
case SOL_IPV6:
- if (optname == IPV6_CHECKSUM)
+ if (optname == IPV6_CHECKSUM ||
+ optname == IPV6_HDRINCL)
break;
default:
return ipv6_setsockopt(sk, level, optname, optval, optlen);
@@ -1033,7 +1043,8 @@ static int compat_rawv6_setsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
return rawv6_seticmpfilter(sk, level, optname, optval, optlen);
case SOL_IPV6:
- if (optname == IPV6_CHECKSUM)
+ if (optname == IPV6_CHECKSUM ||
+ optname == IPV6_HDRINCL)
break;
default:
return compat_ipv6_setsockopt(sk, level, optname,
@@ -1053,6 +1064,9 @@ static int do_rawv6_getsockopt(struct sock *sk, int level, int optname,
return -EFAULT;
switch (optname) {
+ case IPV6_HDRINCL:
+ val = inet_sk(sk)->hdrincl;
+ break;
case IPV6_CHECKSUM:
/*
* We allow getsockopt() for IPPROTO_IPV6-level
@@ -1090,7 +1104,8 @@ static int rawv6_getsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
case SOL_IPV6:
- if (optname == IPV6_CHECKSUM)
+ if (optname == IPV6_CHECKSUM ||
+ optname == IPV6_HDRINCL)
break;
default:
return ipv6_getsockopt(sk, level, optname, optval, optlen);
@@ -1111,7 +1126,8 @@ static int compat_rawv6_getsockopt(struct sock *sk, int level, int optname,
return -EOPNOTSUPP;
return rawv6_geticmpfilter(sk, level, optname, optval, optlen);
case SOL_IPV6:
- if (optname == IPV6_CHECKSUM)
+ if (optname == IPV6_CHECKSUM ||
+ optname == IPV6_HDRINCL)
break;
default:
return compat_ipv6_getsockopt(sk, level, optname,
diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c
index 44e21a03cfc3..e2ea31175ef9 100644
--- a/net/ipv6/reassembly.c
+++ b/net/ipv6/reassembly.c
@@ -108,7 +108,10 @@ bool ip6_frag_match(const struct inet_frag_queue *q, const void *a)
return fq->id == arg->id &&
fq->user == arg->user &&
ipv6_addr_equal(&fq->saddr, arg->src) &&
- ipv6_addr_equal(&fq->daddr, arg->dst);
+ ipv6_addr_equal(&fq->daddr, arg->dst) &&
+ (arg->iif == fq->iif ||
+ !(ipv6_addr_type(arg->dst) & (IPV6_ADDR_MULTICAST |
+ IPV6_ADDR_LINKLOCAL)));
}
EXPORT_SYMBOL(ip6_frag_match);
@@ -180,7 +183,7 @@ static void ip6_frag_expire(unsigned long data)
static struct frag_queue *
fq_find(struct net *net, __be32 id, const struct in6_addr *src,
- const struct in6_addr *dst, u8 ecn)
+ const struct in6_addr *dst, int iif, u8 ecn)
{
struct inet_frag_queue *q;
struct ip6_create_arg arg;
@@ -190,6 +193,7 @@ fq_find(struct net *net, __be32 id, const struct in6_addr *src,
arg.user = IP6_DEFRAG_LOCAL_DELIVER;
arg.src = src;
arg.dst = dst;
+ arg.iif = iif;
arg.ecn = ecn;
hash = inet6_hash_frag(id, src, dst);
@@ -492,10 +496,8 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
IP6CB(head)->flags |= IP6SKB_FRAGMENTED;
/* Yes, and fold redundant checksum back. 8) */
- if (head->ip_summed == CHECKSUM_COMPLETE)
- head->csum = csum_partial(skb_network_header(head),
- skb_network_header_len(head),
- head->csum);
+ skb_postpush_rcsum(head, skb_network_header(head),
+ skb_network_header_len(head));
rcu_read_lock();
IP6_INC_STATS_BH(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
@@ -551,7 +553,7 @@ static int ipv6_frag_rcv(struct sk_buff *skb)
}
fq = fq_find(net, fhdr->identification, &hdr->saddr, &hdr->daddr,
- ip6_frag_ecn(hdr));
+ skb->dev ? skb->dev->ifindex : 0, ip6_frag_ecn(hdr));
if (fq) {
int ret;
@@ -751,7 +753,6 @@ int __init ipv6_frag_init(void)
ip6_frags.hashfn = ip6_hashfn;
ip6_frags.constructor = ip6_frag_init;
ip6_frags.destructor = NULL;
- ip6_frags.skb_free = NULL;
ip6_frags.qsize = sizeof(struct frag_queue);
ip6_frags.match = ip6_frag_match;
ip6_frags.frag_expire = ip6_frag_expire;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index c8bc9b4ac328..ed446639219c 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -62,6 +62,7 @@
#include <net/lwtunnel.h>
#include <net/ip_tunnels.h>
#include <net/l3mdev.h>
+#include <trace/events/fib6.h>
#include <asm/uaccess.h>
@@ -404,6 +405,14 @@ static void ip6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
}
}
+static bool __rt6_check_expired(const struct rt6_info *rt)
+{
+ if (rt->rt6i_flags & RTF_EXPIRES)
+ return time_after(jiffies, rt->dst.expires);
+ else
+ return false;
+}
+
static bool rt6_check_expired(const struct rt6_info *rt)
{
if (rt->rt6i_flags & RTF_EXPIRES) {
@@ -515,7 +524,7 @@ static void rt6_probe_deferred(struct work_struct *w)
container_of(w, struct __rt6_probe_work, work);
addrconf_addr_solict_mult(&work->target, &mcaddr);
- ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL, NULL);
+ ndisc_send_ns(work->dev, &work->target, &mcaddr, NULL);
dev_put(work->dev);
kfree(work);
}
@@ -857,6 +866,9 @@ restart:
}
dst_use(&rt->dst, jiffies);
read_unlock_bh(&table->tb6_lock);
+
+ trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
+
return rt;
}
@@ -1070,6 +1082,8 @@ redo_rt6_select:
read_unlock_bh(&table->tb6_lock);
rt6_dst_from_metrics_check(rt);
+
+ trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
return rt;
} else if (unlikely((fl6->flowi6_flags & FLOWI_FLAG_KNOWN_NH) &&
!(rt->rt6i_flags & RTF_GATEWAY))) {
@@ -1093,6 +1107,8 @@ redo_rt6_select:
uncached_rt = net->ipv6.ip6_null_entry;
dst_hold(&uncached_rt->dst);
+
+ trace_fib6_table_lookup(net, uncached_rt, table->tb6_id, fl6);
return uncached_rt;
} else {
@@ -1117,6 +1133,7 @@ redo_rt6_select:
dst_release(&rt->dst);
}
+ trace_fib6_table_lookup(net, pcpu_rt, table->tb6_id, fl6);
return pcpu_rt;
}
@@ -1166,11 +1183,10 @@ static struct rt6_info *ip6_pol_route_output(struct net *net, struct fib6_table
return ip6_pol_route(net, table, fl6->flowi6_oif, fl6, flags);
}
-struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
- struct flowi6 *fl6)
+struct dst_entry *ip6_route_output_flags(struct net *net, const struct sock *sk,
+ struct flowi6 *fl6, int flags)
{
struct dst_entry *dst;
- int flags = 0;
bool any_src;
dst = l3mdev_rt6_dst_by_oif(net, fl6);
@@ -1191,7 +1207,7 @@ struct dst_entry *ip6_route_output(struct net *net, const struct sock *sk,
return fib6_rule_lookup(net, fl6, flags, ip6_pol_route_output);
}
-EXPORT_SYMBOL(ip6_route_output);
+EXPORT_SYMBOL_GPL(ip6_route_output_flags);
struct dst_entry *ip6_blackhole_route(struct net *net, struct dst_entry *dst_orig)
{
@@ -1252,7 +1268,8 @@ static struct dst_entry *rt6_check(struct rt6_info *rt, u32 cookie)
static struct dst_entry *rt6_dst_from_check(struct rt6_info *rt, u32 cookie)
{
- if (rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
+ if (!__rt6_check_expired(rt) &&
+ rt->dst.obsolete == DST_OBSOLETE_FORCE_CHK &&
rt6_check((struct rt6_info *)(rt->dst.from), cookie))
return &rt->dst;
else
@@ -1272,7 +1289,8 @@ static struct dst_entry *ip6_dst_check(struct dst_entry *dst, u32 cookie)
rt6_dst_from_metrics_check(rt);
- if ((rt->rt6i_flags & RTF_PCPU) || unlikely(dst->flags & DST_NOCACHE))
+ if (rt->rt6i_flags & RTF_PCPU ||
+ (unlikely(dst->flags & DST_NOCACHE) && rt->dst.from))
return rt6_dst_from_check(rt, cookie);
else
return rt6_check(rt, cookie);
@@ -1322,6 +1340,12 @@ static void rt6_do_update_pmtu(struct rt6_info *rt, u32 mtu)
rt6_update_expires(rt, net->ipv6.sysctl.ip6_rt_mtu_expires);
}
+static bool rt6_cache_allowed_for_pmtu(const struct rt6_info *rt)
+{
+ return !(rt->rt6i_flags & RTF_CACHE) &&
+ (rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
+}
+
static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
const struct ipv6hdr *iph, u32 mtu)
{
@@ -1335,7 +1359,7 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
if (mtu >= dst_mtu(dst))
return;
- if (rt6->rt6i_flags & RTF_CACHE) {
+ if (!rt6_cache_allowed_for_pmtu(rt6)) {
rt6_do_update_pmtu(rt6, mtu);
} else {
const struct in6_addr *daddr, *saddr;
@@ -1458,6 +1482,7 @@ out:
read_unlock_bh(&table->tb6_lock);
+ trace_fib6_table_lookup(net, rt, table->tb6_id, fl6);
return rt;
};
@@ -2683,6 +2708,7 @@ static const struct nla_policy rtm_ipv6_policy[RTA_MAX+1] = {
[RTA_PREF] = { .type = NLA_U8 },
[RTA_ENCAP_TYPE] = { .type = NLA_U16 },
[RTA_ENCAP] = { .type = NLA_NESTED },
+ [RTA_EXPIRES] = { .type = NLA_U32 },
};
static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
@@ -2783,6 +2809,15 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct nlmsghdr *nlh,
if (tb[RTA_ENCAP_TYPE])
cfg->fc_encap_type = nla_get_u16(tb[RTA_ENCAP_TYPE]);
+ if (tb[RTA_EXPIRES]) {
+ unsigned long timeout = addrconf_timeout_fixup(nla_get_u32(tb[RTA_EXPIRES]), HZ);
+
+ if (addrconf_finite_timeout(timeout)) {
+ cfg->fc_expires = jiffies_to_clock_t(timeout * HZ);
+ cfg->fc_flags |= RTF_EXPIRES;
+ }
+ }
+
err = 0;
errout:
return err;
diff --git a/net/ipv6/sit.c b/net/ipv6/sit.c
index dcccae86190f..83384308d032 100644
--- a/net/ipv6/sit.c
+++ b/net/ipv6/sit.c
@@ -201,14 +201,14 @@ static int ipip6_tunnel_create(struct net_device *dev)
if ((__force u16)t->parms.i_flags & SIT_ISATAP)
dev->priv_flags |= IFF_ISATAP;
+ dev->rtnl_link_ops = &sit_link_ops;
+
err = register_netdevice(dev);
if (err < 0)
goto out;
ipip6_tunnel_clone_6rd(dev, sitn);
- dev->rtnl_link_ops = &sit_link_ops;
-
dev_hold(dev);
ipip6_tunnel_link(sitn, t);
@@ -475,7 +475,7 @@ static void ipip6_tunnel_uninit(struct net_device *dev)
ipip6_tunnel_unlink(sitn, tunnel);
ipip6_tunnel_del_prl(tunnel, NULL);
}
- ip_tunnel_dst_reset_all(tunnel);
+ dst_cache_reset(&tunnel->dst_cache);
dev_put(dev);
}
@@ -681,14 +681,16 @@ static int ipip6_rcv(struct sk_buff *skb)
skb->mac_header = skb->network_header;
skb_reset_network_header(skb);
IPCB(skb)->flags = 0;
- skb->protocol = htons(ETH_P_IPV6);
+ skb->dev = tunnel->dev;
if (packet_is_spoofed(skb, iph, tunnel)) {
tunnel->dev->stats.rx_errors++;
goto out;
}
- __skb_tunnel_rx(skb, tunnel->dev, tunnel->net);
+ if (iptunnel_pull_header(skb, 0, htons(ETH_P_IPV6),
+ !net_eq(tunnel->net, dev_net(tunnel->dev))))
+ goto out;
err = IP_ECN_decapsulate(iph, skb);
if (unlikely(err)) {
@@ -740,7 +742,7 @@ static int ipip_rcv(struct sk_buff *skb)
if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
goto drop;
- if (iptunnel_pull_header(skb, 0, tpi.proto))
+ if (iptunnel_pull_header(skb, 0, tpi.proto, false))
goto drop;
return ip_tunnel_rcv(tunnel, skb, &tpi, NULL, log_ecn_error);
}
@@ -820,7 +822,6 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
const struct in6_addr *addr6;
int addr_type;
u8 ttl;
- int err;
u8 protocol = IPPROTO_IPV6;
int t_hlen = tunnel->hlen + sizeof(struct iphdr);
@@ -912,7 +913,7 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
goto tx_error;
}
- skb = iptunnel_handle_offloads(skb, false, SKB_GSO_SIT);
+ skb = iptunnel_handle_offloads(skb, SKB_GSO_SIT);
if (IS_ERR(skb)) {
ip_rt_put(rt);
goto out;
@@ -983,10 +984,8 @@ static netdev_tx_t ipip6_tunnel_xmit(struct sk_buff *skb,
skb_set_inner_ipproto(skb, IPPROTO_IPV6);
- err = iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr,
- protocol, tos, ttl, df,
- !net_eq(tunnel->net, dev_net(dev)));
- iptunnel_xmit_stats(err, &dev->stats, dev->tstats);
+ iptunnel_xmit(NULL, rt, skb, fl4.saddr, fl4.daddr, protocol, tos, ttl,
+ df, !net_eq(tunnel->net, dev_net(dev)));
return NETDEV_TX_OK;
tx_error_icmp:
@@ -1003,7 +1002,7 @@ static netdev_tx_t ipip_tunnel_xmit(struct sk_buff *skb, struct net_device *dev)
struct ip_tunnel *tunnel = netdev_priv(dev);
const struct iphdr *tiph = &tunnel->parms.iph;
- skb = iptunnel_handle_offloads(skb, false, SKB_GSO_IPIP);
+ skb = iptunnel_handle_offloads(skb, SKB_GSO_IPIP);
if (IS_ERR(skb))
goto out;
@@ -1096,7 +1095,7 @@ static void ipip6_tunnel_update(struct ip_tunnel *t, struct ip_tunnel_parm *p)
t->parms.link = p->link;
ipip6_tunnel_bind_dev(t->dev);
}
- ip_tunnel_dst_reset_all(t);
+ dst_cache_reset(&t->dst_cache);
netdev_state_change(t->dev);
}
@@ -1127,7 +1126,7 @@ static int ipip6_tunnel_update_6rd(struct ip_tunnel *t,
t->ip6rd.relay_prefix = relay_prefix;
t->ip6rd.prefixlen = ip6rd->prefixlen;
t->ip6rd.relay_prefixlen = ip6rd->relay_prefixlen;
- ip_tunnel_dst_reset_all(t);
+ dst_cache_reset(&t->dst_cache);
netdev_state_change(t->dev);
return 0;
}
@@ -1281,7 +1280,7 @@ ipip6_tunnel_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
err = ipip6_tunnel_add_prl(t, &prl, cmd == SIOCCHGPRL);
break;
}
- ip_tunnel_dst_reset_all(t);
+ dst_cache_reset(&t->dst_cache);
netdev_state_change(dev);
break;
@@ -1342,7 +1341,7 @@ static void ipip6_dev_free(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
- free_percpu(tunnel->dst_cache);
+ dst_cache_destroy(&tunnel->dst_cache);
free_percpu(dev->tstats);
free_netdev(dev);
}
@@ -1375,6 +1374,7 @@ static void ipip6_tunnel_setup(struct net_device *dev)
static int ipip6_tunnel_init(struct net_device *dev)
{
struct ip_tunnel *tunnel = netdev_priv(dev);
+ int err;
tunnel->dev = dev;
tunnel->net = dev_net(dev);
@@ -1385,10 +1385,10 @@ static int ipip6_tunnel_init(struct net_device *dev)
if (!dev->tstats)
return -ENOMEM;
- tunnel->dst_cache = alloc_percpu(struct ip_tunnel_dst);
- if (!tunnel->dst_cache) {
+ err = dst_cache_init(&tunnel->dst_cache, GFP_KERNEL);
+ if (err) {
free_percpu(dev->tstats);
- return -ENOMEM;
+ return err;
}
return 0;
diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
index bb8f2fa1c7fb..aab91fa86c5e 100644
--- a/net/ipv6/syncookies.c
+++ b/net/ipv6/syncookies.c
@@ -41,8 +41,7 @@ static __u16 const msstab[] = {
9000 - 60,
};
-static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
- ipv6_cookie_scratch);
+static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], ipv6_cookie_scratch);
static u32 cookie_hash(const struct in6_addr *saddr, const struct in6_addr *daddr,
__be16 sport, __be16 dport, u32 count, int c)
@@ -148,7 +147,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
struct dst_entry *dst;
__u8 rcv_wscale;
- if (!sysctl_tcp_syncookies || !th->ack || th->rst)
+ if (!sock_net(sk)->ipv4.sysctl_tcp_syncookies || !th->ack || th->rst)
goto out;
if (tcp_synq_no_recent_overflow(sk))
@@ -193,7 +192,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
ireq->pktopts = skb;
}
- ireq->ir_iif = sk->sk_bound_dev_if;
+ ireq->ir_iif = inet_request_bound_dev_if(sk, skb);
/* So that link locals have meaning */
if (!sk->sk_bound_dev_if &&
ipv6_addr_type(&ireq->ir_v6_rmt_addr) & IPV6_ADDR_LINKLOCAL)
@@ -222,9 +221,9 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
memset(&fl6, 0, sizeof(fl6));
fl6.flowi6_proto = IPPROTO_TCP;
fl6.daddr = ireq->ir_v6_rmt_addr;
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ final_p = fl6_update_dst(&fl6, rcu_dereference(np->opt), &final);
fl6.saddr = ireq->ir_v6_loc_addr;
- fl6.flowi6_oif = sk->sk_bound_dev_if;
+ fl6.flowi6_oif = ireq->ir_iif;
fl6.flowi6_mark = ireq->ir_mark;
fl6.fl6_dport = ireq->ir_rmt_port;
fl6.fl6_sport = inet_sk(sk)->inet_sport;
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index ea2f4d5440b5..711d209f9124 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -61,13 +61,12 @@
#include <net/timewait_sock.h>
#include <net/inet_common.h>
#include <net/secure_seq.h>
-#include <net/tcp_memcontrol.h>
#include <net/busy_poll.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
-#include <linux/crypto.h>
+#include <crypto/hash.h>
#include <linux/scatterlist.h>
static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb);
@@ -93,10 +92,9 @@ static void inet6_sk_rx_dst_set(struct sock *sk, const struct sk_buff *skb)
{
struct dst_entry *dst = skb_dst(skb);
- if (dst) {
+ if (dst && dst_hold_safe(dst)) {
const struct rt6_info *rt = (const struct rt6_info *)dst;
- dst_hold(dst);
sk->sk_rx_dst = dst;
inet_sk(sk)->rx_dst_ifindex = skb->skb_iif;
inet6_sk(sk)->rx_dst_cookie = rt6_get_cookie(rt);
@@ -120,6 +118,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
struct ipv6_pinfo *np = inet6_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct in6_addr *saddr = NULL, *final_p, final;
+ struct ipv6_txoptions *opt;
struct flowi6 fl6;
struct dst_entry *dst;
int addr_type;
@@ -235,7 +234,8 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
fl6.fl6_dport = usin->sin6_port;
fl6.fl6_sport = inet->inet_sport;
- final_p = fl6_update_dst(&fl6, np->opt, &final);
+ opt = rcu_dereference_protected(np->opt, sock_owned_by_user(sk));
+ final_p = fl6_update_dst(&fl6, opt, &final);
security_sk_classify_flow(sk, flowi6_to_flowi(&fl6));
@@ -255,7 +255,7 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
inet->inet_rcv_saddr = LOOPBACK4_IPV6;
sk->sk_gso_type = SKB_GSO_TCPV6;
- __ip6_dst_store(sk, dst, NULL, NULL);
+ ip6_dst_store(sk, dst, NULL, NULL);
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp &&
@@ -263,9 +263,9 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
tcp_fetch_timewait_stamp(sk, dst);
icsk->icsk_ext_hdr_len = 0;
- if (np->opt)
- icsk->icsk_ext_hdr_len = (np->opt->opt_flen +
- np->opt->opt_nflen);
+ if (opt)
+ icsk->icsk_ext_hdr_len = opt->opt_flen +
+ opt->opt_nflen;
tp->rx_opt.mss_clamp = IPV6_MIN_MTU - sizeof(struct tcphdr) - sizeof(struct ipv6hdr);
@@ -327,6 +327,7 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
struct tcp_sock *tp;
__u32 seq, snd_una;
struct sock *sk;
+ bool fatal;
int err;
sk = __inet6_lookup_established(net, &tcp_hashinfo,
@@ -345,8 +346,9 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
return;
}
seq = ntohl(th->seq);
+ fatal = icmpv6_err_convert(type, code, &err);
if (sk->sk_state == TCP_NEW_SYN_RECV)
- return tcp_req_err(sk, seq);
+ return tcp_req_err(sk, seq, fatal);
bh_lock_sock(sk);
if (sock_owned_by_user(sk) && type != ICMPV6_PKT_TOOBIG)
@@ -400,7 +402,6 @@ static void tcp_v6_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
goto out;
}
- icmpv6_err_convert(type, code, &err);
/* Might be for an request_sock */
switch (sk->sk_state) {
@@ -461,7 +462,10 @@ static int tcp_v6_send_synack(const struct sock *sk, struct dst_entry *dst,
if (np->repflow && ireq->pktopts)
fl6->flowlabel = ip6_flowlabel(ipv6_hdr(ireq->pktopts));
- err = ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
+ rcu_read_lock();
+ err = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt),
+ np->tclass);
+ rcu_read_unlock();
err = net_xmit_eval(err);
}
@@ -537,7 +541,8 @@ static int tcp_v6_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
bp->len = cpu_to_be32(nbytes);
sg_init_one(&sg, bp, sizeof(*bp));
- return crypto_hash_update(&hp->md5_desc, &sg, sizeof(*bp));
+ ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+ return crypto_ahash_update(hp->md5_req);
}
static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
@@ -545,14 +550,14 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
const struct tcphdr *th)
{
struct tcp_md5sig_pool *hp;
- struct hash_desc *desc;
+ struct ahash_request *req;
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
- desc = &hp->md5_desc;
+ req = hp->md5_req;
- if (crypto_hash_init(desc))
+ if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
goto clear_hash;
@@ -560,7 +565,8 @@ static int tcp_v6_md5_hash_hdr(char *md5_hash, struct tcp_md5sig_key *key,
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
- if (crypto_hash_final(desc, md5_hash))
+ ahash_request_set_crypt(req, NULL, md5_hash, 0);
+ if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
@@ -580,7 +586,7 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
{
const struct in6_addr *saddr, *daddr;
struct tcp_md5sig_pool *hp;
- struct hash_desc *desc;
+ struct ahash_request *req;
const struct tcphdr *th = tcp_hdr(skb);
if (sk) { /* valid for establish/request sockets */
@@ -595,9 +601,9 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
hp = tcp_get_md5sig_pool();
if (!hp)
goto clear_hash_noput;
- desc = &hp->md5_desc;
+ req = hp->md5_req;
- if (crypto_hash_init(desc))
+ if (crypto_ahash_init(req))
goto clear_hash;
if (tcp_v6_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
@@ -608,7 +614,8 @@ static int tcp_v6_md5_hash_skb(char *md5_hash,
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
- if (crypto_hash_final(desc, md5_hash))
+ ahash_request_set_crypt(req, NULL, md5_hash, 0);
+ if (crypto_ahash_final(req))
goto clear_hash;
tcp_put_md5sig_pool();
@@ -852,7 +859,9 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
#ifdef CONFIG_TCP_MD5SIG
hash_location = tcp_parse_md5sig_option(th);
- if (!sk && hash_location) {
+ if (sk && sk_fullsock(sk)) {
+ key = tcp_v6_md5_do_lookup(sk, &ipv6h->saddr);
+ } else if (hash_location) {
/*
* active side is lost. Try to find listening socket through
* source port, and then find md5 key through listening socket.
@@ -861,7 +870,8 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
* no RST generated if md5 hash doesn't match.
*/
sk1 = inet6_lookup_listener(dev_net(skb_dst(skb)->dev),
- &tcp_hashinfo, &ipv6h->saddr,
+ &tcp_hashinfo, NULL, 0,
+ &ipv6h->saddr,
th->source, &ipv6h->daddr,
ntohs(th->source), tcp_v6_iif(skb));
if (!sk1)
@@ -875,8 +885,6 @@ static void tcp_v6_send_reset(const struct sock *sk, struct sk_buff *skb)
genhash = tcp_v6_md5_hash_skb(newhash, key, NULL, skb);
if (genhash || memcmp(hash_location, newhash, 16) != 0)
goto release_sk1;
- } else {
- key = sk ? tcp_v6_md5_do_lookup(sk, &ipv6h->saddr) : NULL;
}
#endif
@@ -972,6 +980,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
struct inet_request_sock *ireq;
struct ipv6_pinfo *newnp;
const struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt;
struct tcp6_sock *newtcp6sk;
struct inet_sock *newinet;
struct tcp_sock *newtp;
@@ -1056,7 +1065,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
*/
newsk->sk_gso_type = SKB_GSO_TCPV6;
- __ip6_dst_store(newsk, dst, NULL, NULL);
+ ip6_dst_store(newsk, dst, NULL, NULL);
inet6_sk_rx_dst_set(newsk, skb);
newtcp6sk = (struct tcp6_sock *)newsk;
@@ -1098,13 +1107,15 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
but we make one more one thing there: reattach optmem
to newsk.
*/
- if (np->opt)
- newnp->opt = ipv6_dup_options(newsk, np->opt);
-
+ opt = rcu_dereference(np->opt);
+ if (opt) {
+ opt = ipv6_dup_options(newsk, opt);
+ RCU_INIT_POINTER(newnp->opt, opt);
+ }
inet_csk(newsk)->icsk_ext_hdr_len = 0;
- if (newnp->opt)
- inet_csk(newsk)->icsk_ext_hdr_len = (newnp->opt->opt_nflen +
- newnp->opt->opt_flen);
+ if (opt)
+ inet_csk(newsk)->icsk_ext_hdr_len = opt->opt_nflen +
+ opt->opt_flen;
tcp_ca_openreq_child(newsk, dst);
@@ -1130,7 +1141,7 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
*/
tcp_md5_do_add(newsk, (union tcp_md5_addr *)&newsk->sk_v6_daddr,
AF_INET6, key->key, key->keylen,
- sk_gfp_atomic(sk, GFP_ATOMIC));
+ sk_gfp_mask(sk, GFP_ATOMIC));
}
#endif
@@ -1140,14 +1151,18 @@ static struct sock *tcp_v6_syn_recv_sock(const struct sock *sk, struct sk_buff *
goto out;
}
*own_req = inet_ehash_nolisten(newsk, req_to_sk(req_unhash));
- /* Clone pktoptions received with SYN, if we own the req */
- if (*own_req && ireq->pktopts) {
- newnp->pktoptions = skb_clone(ireq->pktopts,
- sk_gfp_atomic(sk, GFP_ATOMIC));
- consume_skb(ireq->pktopts);
- ireq->pktopts = NULL;
- if (newnp->pktoptions)
- skb_set_owner_r(newnp->pktoptions, newsk);
+ if (*own_req) {
+ tcp_move_syn(newtp, req);
+
+ /* Clone pktoptions received with SYN, if we own the req */
+ if (ireq->pktopts) {
+ newnp->pktoptions = skb_clone(ireq->pktopts,
+ sk_gfp_mask(sk, GFP_ATOMIC));
+ consume_skb(ireq->pktopts);
+ ireq->pktopts = NULL;
+ if (newnp->pktoptions)
+ skb_set_owner_r(newnp->pktoptions, newsk);
+ }
}
return newsk;
@@ -1208,7 +1223,7 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
--ANK (980728)
*/
if (np->rxopt.all)
- opt_skb = skb_clone(skb, sk_gfp_atomic(sk, GFP_ATOMIC));
+ opt_skb = skb_clone(skb, sk_gfp_mask(sk, GFP_ATOMIC));
if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */
struct dst_entry *dst = sk->sk_rx_dst;
@@ -1365,8 +1380,8 @@ static int tcp_v6_rcv(struct sk_buff *skb)
hdr = ipv6_hdr(skb);
lookup:
- sk = __inet6_lookup_skb(&tcp_hashinfo, skb, th->source, th->dest,
- inet6_iif(skb));
+ sk = __inet6_lookup_skb(&tcp_hashinfo, skb, __tcp_hdrlen(th),
+ th->source, th->dest, inet6_iif(skb));
if (!sk)
goto no_tcp_socket;
@@ -1376,7 +1391,7 @@ process:
if (sk->sk_state == TCP_NEW_SYN_RECV) {
struct request_sock *req = inet_reqsk(sk);
- struct sock *nsk = NULL;
+ struct sock *nsk;
sk = req->rsk_listener;
tcp_v6_fill_cb(skb, hdr, th);
@@ -1384,24 +1399,24 @@ process:
reqsk_put(req);
goto discard_it;
}
- if (likely(sk->sk_state == TCP_LISTEN)) {
- nsk = tcp_check_req(sk, skb, req, false);
- } else {
+ if (unlikely(sk->sk_state != TCP_LISTEN)) {
inet_csk_reqsk_queue_drop_and_put(sk, req);
goto lookup;
}
+ sock_hold(sk);
+ nsk = tcp_check_req(sk, skb, req, false);
if (!nsk) {
reqsk_put(req);
- goto discard_it;
+ goto discard_and_relse;
}
if (nsk == sk) {
- sock_hold(sk);
reqsk_put(req);
tcp_v6_restore_cb(skb);
} else if (tcp_child_process(sk, nsk, skb)) {
tcp_v6_send_reset(nsk, skb);
- goto discard_it;
+ goto discard_and_relse;
} else {
+ sock_put(sk);
return 0;
}
}
@@ -1431,7 +1446,7 @@ process:
sk_incoming_cpu_update(sk);
bh_lock_sock_nested(sk);
- tcp_sk(sk)->segs_in += max_t(u16, 1, skb_shinfo(skb)->gso_segs);
+ tcp_segs_in(tcp_sk(sk), skb);
ret = 0;
if (!sock_owned_by_user(sk)) {
if (!tcp_prequeue(sk, skb))
@@ -1490,6 +1505,7 @@ do_time_wait:
struct sock *sk2;
sk2 = inet6_lookup_listener(dev_net(skb->dev), &tcp_hashinfo,
+ skb, __tcp_hdrlen(th),
&ipv6_hdr(skb)->saddr, th->source,
&ipv6_hdr(skb)->daddr,
ntohs(th->dest), tcp_v6_iif(skb));
@@ -1507,7 +1523,9 @@ do_time_wait:
break;
case TCP_TW_RST:
tcp_v6_restore_cb(skb);
- goto no_tcp_socket;
+ tcp_v6_send_reset(sk, skb);
+ inet_twsk_deschedule_put(inet_twsk(sk));
+ goto discard_it;
case TCP_TW_SUCCESS:
;
}
@@ -1686,6 +1704,8 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
const struct tcp_sock *tp = tcp_sk(sp);
const struct inet_connection_sock *icsk = inet_csk(sp);
const struct fastopen_queue *fastopenq = &icsk->icsk_accept_queue.fastopenq;
+ int rx_queue;
+ int state;
dest = &sp->sk_v6_daddr;
src = &sp->sk_v6_rcv_saddr;
@@ -1706,6 +1726,15 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
timer_expires = jiffies;
}
+ state = sk_state_load(sp);
+ if (state == TCP_LISTEN)
+ rx_queue = sp->sk_ack_backlog;
+ else
+ /* Because we don't lock the socket,
+ * we might find a transient negative value.
+ */
+ rx_queue = max_t(int, tp->rcv_nxt - tp->copied_seq, 0);
+
seq_printf(seq,
"%4d: %08X%08X%08X%08X:%04X %08X%08X%08X%08X:%04X "
"%02X %08X:%08X %02X:%08lX %08X %5u %8d %lu %d %pK %lu %lu %u %u %d\n",
@@ -1714,9 +1743,9 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
src->s6_addr32[2], src->s6_addr32[3], srcp,
dest->s6_addr32[0], dest->s6_addr32[1],
dest->s6_addr32[2], dest->s6_addr32[3], destp,
- sp->sk_state,
- tp->write_seq-tp->snd_una,
- (sp->sk_state == TCP_LISTEN) ? sp->sk_ack_backlog : (tp->rcv_nxt - tp->copied_seq),
+ state,
+ tp->write_seq - tp->snd_una,
+ rx_queue,
timer_active,
jiffies_delta_to_clock_t(timer_expires - jiffies),
icsk->icsk_retransmits,
@@ -1728,7 +1757,7 @@ static void get_tcp6_sock(struct seq_file *seq, struct sock *sp, int i)
jiffies_to_clock_t(icsk->icsk_ack.ato),
(icsk->icsk_ack.quick << 1) | icsk->icsk_ack.pingpong,
tp->snd_cwnd,
- sp->sk_state == TCP_LISTEN ?
+ state == TCP_LISTEN ?
fastopenq->max_qlen :
(tcp_in_initial_slowstart(tp) ? -1 : tp->snd_ssthresh)
);
@@ -1842,7 +1871,7 @@ struct proto tcpv6_prot = {
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v6_do_rcv,
.release_cb = tcp_release_cb,
- .hash = inet_hash,
+ .hash = inet6_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
@@ -1865,10 +1894,8 @@ struct proto tcpv6_prot = {
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
-#ifdef CONFIG_MEMCG_KMEM
- .proto_cgroup = tcp_proto_cgroup,
-#endif
.clear_sk = tcp_v6_clear_sk,
+ .diag_destroy = tcp_abort,
};
static const struct inet6_protocol tcpv6_protocol = {
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 01bcb49619ee..8125931106be 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -37,6 +37,7 @@
#include <linux/slab.h>
#include <asm/uaccess.h>
+#include <net/addrconf.h>
#include <net/ndisc.h>
#include <net/protocol.h>
#include <net/transp_v6.h>
@@ -47,6 +48,7 @@
#include <net/xfrm.h>
#include <net/inet6_hashtables.h>
#include <net/busy_poll.h>
+#include <net/sock_reuseport.h>
#include <linux/proc_fs.h>
#include <linux/seq_file.h>
@@ -76,34 +78,6 @@ static u32 udp6_ehashfn(const struct net *net,
udp_ipv6_hash_secret + net_hash_mix(net));
}
-int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2)
-{
- const struct in6_addr *sk2_rcv_saddr6 = inet6_rcv_saddr(sk2);
- int sk2_ipv6only = inet_v6_ipv6only(sk2);
- int addr_type = ipv6_addr_type(&sk->sk_v6_rcv_saddr);
- int addr_type2 = sk2_rcv_saddr6 ? ipv6_addr_type(sk2_rcv_saddr6) : IPV6_ADDR_MAPPED;
-
- /* if both are mapped, treat as IPv4 */
- if (addr_type == IPV6_ADDR_MAPPED && addr_type2 == IPV6_ADDR_MAPPED)
- return (!sk2_ipv6only &&
- (!sk->sk_rcv_saddr || !sk2->sk_rcv_saddr ||
- sk->sk_rcv_saddr == sk2->sk_rcv_saddr));
-
- if (addr_type2 == IPV6_ADDR_ANY &&
- !(sk2_ipv6only && addr_type == IPV6_ADDR_MAPPED))
- return 1;
-
- if (addr_type == IPV6_ADDR_ANY &&
- !(ipv6_only_sock(sk) && addr_type2 == IPV6_ADDR_MAPPED))
- return 1;
-
- if (sk2_rcv_saddr6 &&
- ipv6_addr_equal(&sk->sk_v6_rcv_saddr, sk2_rcv_saddr6))
- return 1;
-
- return 0;
-}
-
static u32 udp6_portaddr_hash(const struct net *net,
const struct in6_addr *addr6,
unsigned int port)
@@ -235,11 +209,13 @@ static inline int compute_score2(struct sock *sk, struct net *net,
static struct sock *udp6_lib_lookup2(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, unsigned int hnum, int dif,
- struct udp_hslot *hslot2, unsigned int slot2)
+ struct udp_hslot *hslot2, unsigned int slot2,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
int score, badness, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 hash = 0;
begin:
@@ -255,6 +231,17 @@ begin:
if (reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+
+ sk2 = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
+ if (sk2) {
+ result = sk2;
+ select_ok = false;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -273,6 +260,7 @@ begin:
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score2(result, net, saddr, sport,
@@ -287,7 +275,8 @@ begin:
struct sock *__udp6_lib_lookup(struct net *net,
const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport,
- int dif, struct udp_table *udptable)
+ int dif, struct udp_table *udptable,
+ struct sk_buff *skb)
{
struct sock *sk, *result;
struct hlist_nulls_node *node;
@@ -295,6 +284,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
unsigned int hash2, slot2, slot = udp_hashfn(net, hnum, udptable->mask);
struct udp_hslot *hslot2, *hslot = &udptable->hash[slot];
int score, badness, matches = 0, reuseport = 0;
+ bool select_ok = true;
u32 hash = 0;
rcu_read_lock();
@@ -307,7 +297,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
daddr, hnum, dif,
- hslot2, slot2);
+ hslot2, slot2, skb);
if (!result) {
hash2 = udp6_portaddr_hash(net, &in6addr_any, hnum);
slot2 = hash2 & udptable->mask;
@@ -317,7 +307,7 @@ struct sock *__udp6_lib_lookup(struct net *net,
result = udp6_lib_lookup2(net, saddr, sport,
&in6addr_any, hnum, dif,
- hslot2, slot2);
+ hslot2, slot2, skb);
}
rcu_read_unlock();
return result;
@@ -334,6 +324,17 @@ begin:
if (reuseport) {
hash = udp6_ehashfn(net, daddr, hnum,
saddr, sport);
+ if (select_ok) {
+ struct sock *sk2;
+
+ sk2 = reuseport_select_sock(sk, hash, skb,
+ sizeof(struct udphdr));
+ if (sk2) {
+ result = sk2;
+ select_ok = false;
+ goto found;
+ }
+ }
matches = 1;
}
} else if (score == badness && reuseport) {
@@ -352,6 +353,7 @@ begin:
goto begin;
if (result) {
+found:
if (unlikely(!atomic_inc_not_zero_hint(&result->sk_refcnt, 2)))
result = NULL;
else if (unlikely(compute_score(result, net, hnum, saddr, sport,
@@ -377,13 +379,13 @@ static struct sock *__udp6_lib_lookup_skb(struct sk_buff *skb,
return sk;
return __udp6_lib_lookup(dev_net(skb_dst(skb)->dev), &iph->saddr, sport,
&iph->daddr, dport, inet6_iif(skb),
- udptable);
+ udptable, skb);
}
struct sock *udp6_lib_lookup(struct net *net, const struct in6_addr *saddr, __be16 sport,
const struct in6_addr *daddr, __be16 dport, int dif)
{
- return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table);
+ return __udp6_lib_lookup(net, saddr, sport, daddr, dport, dif, &udp_table, NULL);
}
EXPORT_SYMBOL_GPL(udp6_lib_lookup);
@@ -402,6 +404,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, size_t len,
int peeked, off = 0;
int err;
int is_udplite = IS_UDPLITE(sk);
+ bool checksum_valid = false;
int is_udp4;
bool slow;
@@ -433,11 +436,12 @@ try_again:
*/
if (copied < ulen || UDP_SKB_CB(skb)->partial_cov) {
- if (udp_lib_checksum_complete(skb))
+ checksum_valid = !udp_lib_checksum_complete(skb);
+ if (!checksum_valid)
goto csum_copy_err;
}
- if (skb_csum_unnecessary(skb))
+ if (checksum_valid || skb_csum_unnecessary(skb))
err = skb_copy_datagram_msg(skb, sizeof(struct udphdr),
msg, copied);
else {
@@ -544,37 +548,39 @@ void __udp6_lib_err(struct sk_buff *skb, struct inet6_skb_parm *opt,
const struct in6_addr *daddr = &hdr->daddr;
struct udphdr *uh = (struct udphdr *)(skb->data+offset);
struct sock *sk;
+ int harderr;
int err;
struct net *net = dev_net(skb->dev);
- sk = __udp6_lib_lookup(net, daddr, uh->dest,
- saddr, uh->source, inet6_iif(skb), udptable);
+ sk = __udp6_lib_lookup(net, daddr, uh->dest, saddr, uh->source,
+ inet6_iif(skb), udptable, skb);
if (!sk) {
ICMP6_INC_STATS_BH(net, __in6_dev_get(skb->dev),
ICMP6_MIB_INERRORS);
return;
}
+ harderr = icmpv6_err_convert(type, code, &err);
+ np = inet6_sk(sk);
+
if (type == ICMPV6_PKT_TOOBIG) {
if (!ip6_sk_accept_pmtu(sk))
goto out;
ip6_sk_update_pmtu(skb, sk, info);
+ if (np->pmtudisc != IPV6_PMTUDISC_DONT)
+ harderr = 1;
}
if (type == NDISC_REDIRECT) {
ip6_sk_redirect(skb, sk);
goto out;
}
- np = inet6_sk(sk);
-
- if (!icmpv6_err_convert(type, code, &err) && !np->recverr)
- goto out;
-
- if (sk->sk_state != TCP_ESTABLISHED && !np->recverr)
- goto out;
-
- if (np->recverr)
+ if (!np->recverr) {
+ if (!harderr || sk->sk_state != TCP_ESTABLISHED)
+ goto out;
+ } else {
ipv6_icmp_error(sk, skb, err, uh->dest, ntohl(info), (u8 *)(uh+1));
+ }
sk->sk_err = err;
sk->sk_error_report(sk);
@@ -837,8 +843,8 @@ start_lookup:
flush_stack(stack, count, skb, count - 1);
} else {
if (!inner_flushed)
- UDP_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
- proto == IPPROTO_UDPLITE);
+ UDP6_INC_STATS_BH(net, UDP_MIB_IGNOREDMULTI,
+ proto == IPPROTO_UDPLITE);
consume_skb(skb);
}
return 0;
@@ -916,11 +922,9 @@ int __udp6_lib_rcv(struct sk_buff *skb, struct udp_table *udptable,
ret = udpv6_queue_rcv_skb(sk, skb);
sock_put(sk);
- /* a return value > 0 means to resubmit the input, but
- * it wants the return to be -protocol, or 0
- */
+ /* a return value > 0 means to resubmit the input */
if (ret > 0)
- return -ret;
+ return ret;
return 0;
}
@@ -1110,6 +1114,7 @@ int udpv6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
DECLARE_SOCKADDR(struct sockaddr_in6 *, sin6, msg->msg_name);
struct in6_addr *daddr, *final_p, final;
struct ipv6_txoptions *opt = NULL;
+ struct ipv6_txoptions *opt_to_free = NULL;
struct ip6_flowlabel *flowlabel = NULL;
struct flowi6 fl6;
struct dst_entry *dst;
@@ -1263,8 +1268,10 @@ do_udp_sendmsg:
opt = NULL;
connected = 0;
}
- if (!opt)
- opt = np->opt;
+ if (!opt) {
+ opt = txopt_get(np);
+ opt_to_free = opt;
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
@@ -1373,6 +1380,7 @@ release_dst:
out:
dst_release(dst);
fl6_sock_release(flowlabel);
+ txopt_put(opt_to_free);
if (!err)
return len;
/*
diff --git a/net/ipv6/udp_offload.c b/net/ipv6/udp_offload.c
index 7441e1e63893..2b0fbe6929e8 100644
--- a/net/ipv6/udp_offload.c
+++ b/net/ipv6/udp_offload.c
@@ -81,12 +81,18 @@ static struct sk_buff *udp6_ufo_fragment(struct sk_buff *skb,
csum = skb_checksum(skb, 0, skb->len, 0);
uh->check = udp_v6_check(skb->len, &ipv6h->saddr,
&ipv6h->daddr, csum);
-
if (uh->check == 0)
uh->check = CSUM_MANGLED_0;
skb->ip_summed = CHECKSUM_NONE;
+ /* If there is no outer header we can fake a checksum offload
+ * due to the fact that we have already done the checksum in
+ * software prior to segmenting the frame.
+ */
+ if (!skb->encap_hdr_csum)
+ features |= NETIF_F_HW_CSUM;
+
/* Check if there is enough headroom to insert fragment header. */
tnl_hlen = skb_tnl_header_len(skb);
if (skb->mac_header < (tnl_hlen + frag_hdr_sz)) {
diff --git a/net/ipv6/xfrm6_mode_tunnel.c b/net/ipv6/xfrm6_mode_tunnel.c
index f7fbdbabe50e..372855eeaf42 100644
--- a/net/ipv6/xfrm6_mode_tunnel.c
+++ b/net/ipv6/xfrm6_mode_tunnel.c
@@ -23,7 +23,7 @@ static inline void ipip6_ecn_decapsulate(struct sk_buff *skb)
struct ipv6hdr *inner_iph = ipipv6_hdr(skb);
if (INET_ECN_is_ce(XFRM_MODE_SKB_CB(skb)->tos))
- IP6_ECN_set_ce(inner_iph);
+ IP6_ECN_set_ce(skb, inner_iph);
}
/* Add encapsulation header.
diff --git a/net/ipv6/xfrm6_policy.c b/net/ipv6/xfrm6_policy.c
index 5643423fe67a..c074771a10f7 100644
--- a/net/ipv6/xfrm6_policy.c
+++ b/net/ipv6/xfrm6_policy.c
@@ -279,7 +279,7 @@ static void xfrm6_dst_ifdown(struct dst_entry *dst, struct net_device *dev,
xfrm_dst_ifdown(dst, dev);
}
-static struct dst_ops xfrm6_dst_ops = {
+static struct dst_ops xfrm6_dst_ops_template = {
.family = AF_INET6,
.gc = xfrm6_garbage_collect,
.update_pmtu = xfrm6_update_pmtu,
@@ -293,7 +293,7 @@ static struct dst_ops xfrm6_dst_ops = {
static struct xfrm_policy_afinfo xfrm6_policy_afinfo = {
.family = AF_INET6,
- .dst_ops = &xfrm6_dst_ops,
+ .dst_ops = &xfrm6_dst_ops_template,
.dst_lookup = xfrm6_dst_lookup,
.get_saddr = xfrm6_get_saddr,
.decode_session = _decode_session6,
@@ -325,7 +325,7 @@ static struct ctl_table xfrm6_policy_table[] = {
{ }
};
-static int __net_init xfrm6_net_init(struct net *net)
+static int __net_init xfrm6_net_sysctl_init(struct net *net)
{
struct ctl_table *table;
struct ctl_table_header *hdr;
@@ -353,7 +353,7 @@ err_alloc:
return -ENOMEM;
}
-static void __net_exit xfrm6_net_exit(struct net *net)
+static void __net_exit xfrm6_net_sysctl_exit(struct net *net)
{
struct ctl_table *table;
@@ -365,24 +365,52 @@ static void __net_exit xfrm6_net_exit(struct net *net)
if (!net_eq(net, &init_net))
kfree(table);
}
+#else /* CONFIG_SYSCTL */
+static int inline xfrm6_net_sysctl_init(struct net *net)
+{
+ return 0;
+}
+
+static void inline xfrm6_net_sysctl_exit(struct net *net)
+{
+}
+#endif
+
+static int __net_init xfrm6_net_init(struct net *net)
+{
+ int ret;
+
+ memcpy(&net->xfrm.xfrm6_dst_ops, &xfrm6_dst_ops_template,
+ sizeof(xfrm6_dst_ops_template));
+ ret = dst_entries_init(&net->xfrm.xfrm6_dst_ops);
+ if (ret)
+ return ret;
+
+ ret = xfrm6_net_sysctl_init(net);
+ if (ret)
+ dst_entries_destroy(&net->xfrm.xfrm6_dst_ops);
+
+ return ret;
+}
+
+static void __net_exit xfrm6_net_exit(struct net *net)
+{
+ xfrm6_net_sysctl_exit(net);
+ dst_entries_destroy(&net->xfrm.xfrm6_dst_ops);
+}
static struct pernet_operations xfrm6_net_ops = {
.init = xfrm6_net_init,
.exit = xfrm6_net_exit,
};
-#endif
int __init xfrm6_init(void)
{
int ret;
- dst_entries_init(&xfrm6_dst_ops);
-
ret = xfrm6_policy_init();
- if (ret) {
- dst_entries_destroy(&xfrm6_dst_ops);
+ if (ret)
goto out;
- }
ret = xfrm6_state_init();
if (ret)
goto out_policy;
@@ -391,9 +419,7 @@ int __init xfrm6_init(void)
if (ret)
goto out_state;
-#ifdef CONFIG_SYSCTL
register_pernet_subsys(&xfrm6_net_ops);
-#endif
out:
return ret;
out_state:
@@ -405,11 +431,8 @@ out_policy:
void xfrm6_fini(void)
{
-#ifdef CONFIG_SYSCTL
unregister_pernet_subsys(&xfrm6_net_ops);
-#endif
xfrm6_protocol_fini();
xfrm6_policy_fini();
xfrm6_state_fini();
- dst_entries_destroy(&xfrm6_dst_ops);
}
diff --git a/net/irda/af_irda.c b/net/irda/af_irda.c
index e6aa48b5395c..923abd6b3064 100644
--- a/net/irda/af_irda.c
+++ b/net/irda/af_irda.c
@@ -1086,6 +1086,9 @@ static int irda_create(struct net *net, struct socket *sock, int protocol,
struct sock *sk;
struct irda_sock *self;
+ if (protocol < 0 || protocol > SK_PROTOCOL_MAX)
+ return -EINVAL;
+
if (net != &init_net)
return -EAFNOSUPPORT;
diff --git a/net/irda/ircomm/ircomm_param.c b/net/irda/ircomm/ircomm_param.c
index 3c4caa60c926..5728e76ca6d5 100644
--- a/net/irda/ircomm/ircomm_param.c
+++ b/net/irda/ircomm/ircomm_param.c
@@ -134,11 +134,10 @@ int ircomm_param_request(struct ircomm_tty_cb *self, __u8 pi, int flush)
return -1;
}
skb_put(skb, count);
+ pr_debug("%s(), skb->len=%d\n", __func__, skb->len);
spin_unlock_irqrestore(&self->spinlock, flags);
- pr_debug("%s(), skb->len=%d\n", __func__ , skb->len);
-
if (flush) {
/* ircomm_tty_do_softint will take care of the rest */
schedule_work(&self->tqueue);
diff --git a/net/irda/ircomm/ircomm_tty.c b/net/irda/ircomm/ircomm_tty.c
index a4237707f79d..da126ee6d218 100644
--- a/net/irda/ircomm/ircomm_tty.c
+++ b/net/irda/ircomm/ircomm_tty.c
@@ -287,14 +287,14 @@ static int ircomm_tty_block_til_ready(struct ircomm_tty_cb *self,
if (filp->f_flags & O_NONBLOCK) {
/* nonblock mode is set */
- if (tty->termios.c_cflag & CBAUD)
+ if (C_BAUD(tty))
tty_port_raise_dtr_rts(port);
port->flags |= ASYNC_NORMAL_ACTIVE;
pr_debug("%s(), O_NONBLOCK requested!\n", __func__);
return 0;
}
- if (tty->termios.c_cflag & CLOCAL) {
+ if (C_CLOCAL(tty)) {
pr_debug("%s(), doing CLOCAL!\n", __func__);
do_clocal = 1;
}
@@ -806,7 +806,7 @@ static void ircomm_tty_throttle(struct tty_struct *tty)
ircomm_tty_send_xchar(tty, STOP_CHAR(tty));
/* Hardware flow control? */
- if (tty->termios.c_cflag & CRTSCTS) {
+ if (C_CRTSCTS(tty)) {
self->settings.dte &= ~IRCOMM_RTS;
self->settings.dte |= IRCOMM_DELTA_RTS;
@@ -831,12 +831,11 @@ static void ircomm_tty_unthrottle(struct tty_struct *tty)
IRDA_ASSERT(self->magic == IRCOMM_TTY_MAGIC, return;);
/* Using software flow control? */
- if (I_IXOFF(tty)) {
+ if (I_IXOFF(tty))
ircomm_tty_send_xchar(tty, START_CHAR(tty));
- }
/* Using hardware flow control? */
- if (tty->termios.c_cflag & CRTSCTS) {
+ if (C_CRTSCTS(tty)) {
self->settings.dte |= (IRCOMM_RTS|IRCOMM_DELTA_RTS);
ircomm_param_request(self, IRCOMM_DTE, TRUE);
@@ -1268,10 +1267,6 @@ static void ircomm_tty_line_info(struct ircomm_tty_cb *self, struct seq_file *m)
seq_printf(m, "%cASYNC_LOW_LATENCY", sep);
sep = '|';
}
- if (self->port.flags & ASYNC_CLOSING) {
- seq_printf(m, "%cASYNC_CLOSING", sep);
- sep = '|';
- }
if (self->port.flags & ASYNC_NORMAL_ACTIVE) {
seq_printf(m, "%cASYNC_NORMAL_ACTIVE", sep);
sep = '|';
diff --git a/net/irda/ircomm/ircomm_tty_ioctl.c b/net/irda/ircomm/ircomm_tty_ioctl.c
index 75ccdbd0728e..d3687aaa23de 100644
--- a/net/irda/ircomm/ircomm_tty_ioctl.c
+++ b/net/irda/ircomm/ircomm_tty_ioctl.c
@@ -158,26 +158,21 @@ void ircomm_tty_set_termios(struct tty_struct *tty,
ircomm_tty_change_speed(self, tty);
/* Handle transition to B0 status */
- if ((old_termios->c_cflag & CBAUD) &&
- !(cflag & CBAUD)) {
+ if ((old_termios->c_cflag & CBAUD) && !(cflag & CBAUD)) {
self->settings.dte &= ~(IRCOMM_DTR|IRCOMM_RTS);
ircomm_param_request(self, IRCOMM_DTE, TRUE);
}
/* Handle transition away from B0 status */
- if (!(old_termios->c_cflag & CBAUD) &&
- (cflag & CBAUD)) {
+ if (!(old_termios->c_cflag & CBAUD) && (cflag & CBAUD)) {
self->settings.dte |= IRCOMM_DTR;
- if (!(tty->termios.c_cflag & CRTSCTS) ||
- !test_bit(TTY_THROTTLED, &tty->flags)) {
+ if (!C_CRTSCTS(tty) || !test_bit(TTY_THROTTLED, &tty->flags))
self->settings.dte |= IRCOMM_RTS;
- }
ircomm_param_request(self, IRCOMM_DTE, TRUE);
}
/* Handle turning off CRTSCTS */
- if ((old_termios->c_cflag & CRTSCTS) &&
- !(tty->termios.c_cflag & CRTSCTS))
+ if ((old_termios->c_cflag & CRTSCTS) && !C_CRTSCTS(tty))
{
tty->hw_stopped = 0;
ircomm_tty_start(tty);
diff --git a/net/iucv/af_iucv.c b/net/iucv/af_iucv.c
index fcb2752419c6..fc3598a922b0 100644
--- a/net/iucv/af_iucv.c
+++ b/net/iucv/af_iucv.c
@@ -303,7 +303,7 @@ static void iucv_sock_wake_msglim(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_all(&wq->wait);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
rcu_read_unlock();
@@ -708,6 +708,9 @@ static int iucv_sock_bind(struct socket *sock, struct sockaddr *addr,
if (!addr || addr->sa_family != AF_IUCV)
return -EINVAL;
+ if (addr_len < sizeof(struct sockaddr_iucv))
+ return -EINVAL;
+
lock_sock(sk);
if (sk->sk_state != IUCV_OPEN) {
err = -EBADFD;
@@ -1031,7 +1034,7 @@ static int iucv_sock_sendmsg(struct socket *sock, struct msghdr *msg,
struct sock *sk = sock->sk;
struct iucv_sock *iucv = iucv_sk(sk);
struct sk_buff *skb;
- struct iucv_message txmsg;
+ struct iucv_message txmsg = {0};
struct cmsghdr *cmsg;
int cmsg_done;
long timeo;
@@ -1483,7 +1486,7 @@ unsigned int iucv_sock_poll(struct file *file, struct socket *sock,
if (sock_writeable(sk) && iucv_below_msglim(sk))
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
@@ -2084,11 +2087,7 @@ static int afiucv_hs_callback_rx(struct sock *sk, struct sk_buff *skb)
return NET_RX_SUCCESS;
}
- /* write stuff from iucv_msg to skb cb */
- if (skb->len < sizeof(struct af_iucv_trans_hdr)) {
- kfree_skb(skb);
- return NET_RX_SUCCESS;
- }
+ /* write stuff from iucv_msg to skb cb */
skb_pull(skb, sizeof(struct af_iucv_trans_hdr));
skb_reset_transport_header(skb);
skb_reset_network_header(skb);
@@ -2119,6 +2118,20 @@ static int afiucv_hs_rcv(struct sk_buff *skb, struct net_device *dev,
char nullstring[8];
int err = 0;
+ if (skb->len < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr))) {
+ WARN_ONCE(1, "AF_IUCV too short skb, len=%d, min=%d",
+ (int)skb->len,
+ (int)(ETH_HLEN + sizeof(struct af_iucv_trans_hdr)));
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
+ if (skb_headlen(skb) < (ETH_HLEN + sizeof(struct af_iucv_trans_hdr)))
+ if (skb_linearize(skb)) {
+ WARN_ONCE(1, "AF_IUCV skb_linearize failed, len=%d",
+ (int)skb->len);
+ kfree_skb(skb);
+ return NET_RX_SUCCESS;
+ }
skb_pull(skb, ETH_HLEN);
trans_hdr = (struct af_iucv_trans_hdr *)skb->data;
EBCASC(trans_hdr->destAppName, sizeof(trans_hdr->destAppName));
diff --git a/net/kcm/Kconfig b/net/kcm/Kconfig
new file mode 100644
index 000000000000..5db94d940ecc
--- /dev/null
+++ b/net/kcm/Kconfig
@@ -0,0 +1,10 @@
+
+config AF_KCM
+ tristate "KCM sockets"
+ depends on INET
+ select BPF_SYSCALL
+ ---help---
+ KCM (Kernel Connection Multiplexor) sockets provide a method
+ for multiplexing messages of a message based application
+ protocol over kernel connectons (e.g. TCP connections).
+
diff --git a/net/kcm/Makefile b/net/kcm/Makefile
new file mode 100644
index 000000000000..71256133e677
--- /dev/null
+++ b/net/kcm/Makefile
@@ -0,0 +1,3 @@
+obj-$(CONFIG_AF_KCM) += kcm.o
+
+kcm-y := kcmsock.o kcmproc.o
diff --git a/net/kcm/kcmproc.c b/net/kcm/kcmproc.c
new file mode 100644
index 000000000000..738008726cc6
--- /dev/null
+++ b/net/kcm/kcmproc.c
@@ -0,0 +1,426 @@
+#include <linux/in.h>
+#include <linux/inet.h>
+#include <linux/list.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/proc_fs.h>
+#include <linux/rculist.h>
+#include <linux/seq_file.h>
+#include <linux/socket.h>
+#include <net/inet_sock.h>
+#include <net/kcm.h>
+#include <net/net_namespace.h>
+#include <net/netns/generic.h>
+#include <net/tcp.h>
+
+#ifdef CONFIG_PROC_FS
+struct kcm_seq_muxinfo {
+ char *name;
+ const struct file_operations *seq_fops;
+ const struct seq_operations seq_ops;
+};
+
+static struct kcm_mux *kcm_get_first(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+ return list_first_or_null_rcu(&knet->mux_list,
+ struct kcm_mux, kcm_mux_list);
+}
+
+static struct kcm_mux *kcm_get_next(struct kcm_mux *mux)
+{
+ struct kcm_net *knet = mux->knet;
+
+ return list_next_or_null_rcu(&knet->mux_list, &mux->kcm_mux_list,
+ struct kcm_mux, kcm_mux_list);
+}
+
+static struct kcm_mux *kcm_get_idx(struct seq_file *seq, loff_t pos)
+{
+ struct net *net = seq_file_net(seq);
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+ struct kcm_mux *m;
+
+ list_for_each_entry_rcu(m, &knet->mux_list, kcm_mux_list) {
+ if (!pos)
+ return m;
+ --pos;
+ }
+ return NULL;
+}
+
+static void *kcm_seq_next(struct seq_file *seq, void *v, loff_t *pos)
+{
+ void *p;
+
+ if (v == SEQ_START_TOKEN)
+ p = kcm_get_first(seq);
+ else
+ p = kcm_get_next(v);
+ ++*pos;
+ return p;
+}
+
+static void *kcm_seq_start(struct seq_file *seq, loff_t *pos)
+ __acquires(rcu)
+{
+ rcu_read_lock();
+
+ if (!*pos)
+ return SEQ_START_TOKEN;
+ else
+ return kcm_get_idx(seq, *pos - 1);
+}
+
+static void kcm_seq_stop(struct seq_file *seq, void *v)
+ __releases(rcu)
+{
+ rcu_read_unlock();
+}
+
+struct kcm_proc_mux_state {
+ struct seq_net_private p;
+ int idx;
+};
+
+static int kcm_seq_open(struct inode *inode, struct file *file)
+{
+ struct kcm_seq_muxinfo *muxinfo = PDE_DATA(inode);
+ int err;
+
+ err = seq_open_net(inode, file, &muxinfo->seq_ops,
+ sizeof(struct kcm_proc_mux_state));
+ if (err < 0)
+ return err;
+ return err;
+}
+
+static void kcm_format_mux_header(struct seq_file *seq)
+{
+ struct net *net = seq_file_net(seq);
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+ seq_printf(seq,
+ "*** KCM statistics (%d MUX) ****\n",
+ knet->count);
+
+ seq_printf(seq,
+ "%-14s %-10s %-16s %-10s %-16s %-8s %-8s %-8s %-8s %s",
+ "Object",
+ "RX-Msgs",
+ "RX-Bytes",
+ "TX-Msgs",
+ "TX-Bytes",
+ "Recv-Q",
+ "Rmem",
+ "Send-Q",
+ "Smem",
+ "Status");
+
+ /* XXX: pdsts header stuff here */
+ seq_puts(seq, "\n");
+}
+
+static void kcm_format_sock(struct kcm_sock *kcm, struct seq_file *seq,
+ int i, int *len)
+{
+ seq_printf(seq,
+ " kcm-%-7u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8s ",
+ kcm->index,
+ kcm->stats.rx_msgs,
+ kcm->stats.rx_bytes,
+ kcm->stats.tx_msgs,
+ kcm->stats.tx_bytes,
+ kcm->sk.sk_receive_queue.qlen,
+ sk_rmem_alloc_get(&kcm->sk),
+ kcm->sk.sk_write_queue.qlen,
+ "-");
+
+ if (kcm->tx_psock)
+ seq_printf(seq, "Psck-%u ", kcm->tx_psock->index);
+
+ if (kcm->tx_wait)
+ seq_puts(seq, "TxWait ");
+
+ if (kcm->tx_wait_more)
+ seq_puts(seq, "WMore ");
+
+ if (kcm->rx_wait)
+ seq_puts(seq, "RxWait ");
+
+ seq_puts(seq, "\n");
+}
+
+static void kcm_format_psock(struct kcm_psock *psock, struct seq_file *seq,
+ int i, int *len)
+{
+ seq_printf(seq,
+ " psock-%-5u %-10llu %-16llu %-10llu %-16llu %-8d %-8d %-8d %-8d ",
+ psock->index,
+ psock->stats.rx_msgs,
+ psock->stats.rx_bytes,
+ psock->stats.tx_msgs,
+ psock->stats.tx_bytes,
+ psock->sk->sk_receive_queue.qlen,
+ atomic_read(&psock->sk->sk_rmem_alloc),
+ psock->sk->sk_write_queue.qlen,
+ atomic_read(&psock->sk->sk_wmem_alloc));
+
+ if (psock->done)
+ seq_puts(seq, "Done ");
+
+ if (psock->tx_stopped)
+ seq_puts(seq, "TxStop ");
+
+ if (psock->rx_stopped)
+ seq_puts(seq, "RxStop ");
+
+ if (psock->tx_kcm)
+ seq_printf(seq, "Rsvd-%d ", psock->tx_kcm->index);
+
+ if (psock->ready_rx_msg)
+ seq_puts(seq, "RdyRx ");
+
+ seq_puts(seq, "\n");
+}
+
+static void
+kcm_format_mux(struct kcm_mux *mux, loff_t idx, struct seq_file *seq)
+{
+ int i, len;
+ struct kcm_sock *kcm;
+ struct kcm_psock *psock;
+
+ /* mux information */
+ seq_printf(seq,
+ "%-6s%-8s %-10llu %-16llu %-10llu %-16llu %-8s %-8s %-8s %-8s ",
+ "mux", "",
+ mux->stats.rx_msgs,
+ mux->stats.rx_bytes,
+ mux->stats.tx_msgs,
+ mux->stats.tx_bytes,
+ "-", "-", "-", "-");
+
+ seq_printf(seq, "KCMs: %d, Psocks %d\n",
+ mux->kcm_socks_cnt, mux->psocks_cnt);
+
+ /* kcm sock information */
+ i = 0;
+ spin_lock_bh(&mux->lock);
+ list_for_each_entry(kcm, &mux->kcm_socks, kcm_sock_list) {
+ kcm_format_sock(kcm, seq, i, &len);
+ i++;
+ }
+ i = 0;
+ list_for_each_entry(psock, &mux->psocks, psock_list) {
+ kcm_format_psock(psock, seq, i, &len);
+ i++;
+ }
+ spin_unlock_bh(&mux->lock);
+}
+
+static int kcm_seq_show(struct seq_file *seq, void *v)
+{
+ struct kcm_proc_mux_state *mux_state;
+
+ mux_state = seq->private;
+ if (v == SEQ_START_TOKEN) {
+ mux_state->idx = 0;
+ kcm_format_mux_header(seq);
+ } else {
+ kcm_format_mux(v, mux_state->idx, seq);
+ mux_state->idx++;
+ }
+ return 0;
+}
+
+static const struct file_operations kcm_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = kcm_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+};
+
+static struct kcm_seq_muxinfo kcm_seq_muxinfo = {
+ .name = "kcm",
+ .seq_fops = &kcm_seq_fops,
+ .seq_ops = {
+ .show = kcm_seq_show,
+ .start = kcm_seq_start,
+ .next = kcm_seq_next,
+ .stop = kcm_seq_stop,
+ }
+};
+
+static int kcm_proc_register(struct net *net, struct kcm_seq_muxinfo *muxinfo)
+{
+ struct proc_dir_entry *p;
+ int rc = 0;
+
+ p = proc_create_data(muxinfo->name, S_IRUGO, net->proc_net,
+ muxinfo->seq_fops, muxinfo);
+ if (!p)
+ rc = -ENOMEM;
+ return rc;
+}
+EXPORT_SYMBOL(kcm_proc_register);
+
+static void kcm_proc_unregister(struct net *net,
+ struct kcm_seq_muxinfo *muxinfo)
+{
+ remove_proc_entry(muxinfo->name, net->proc_net);
+}
+EXPORT_SYMBOL(kcm_proc_unregister);
+
+static int kcm_stats_seq_show(struct seq_file *seq, void *v)
+{
+ struct kcm_psock_stats psock_stats;
+ struct kcm_mux_stats mux_stats;
+ struct kcm_mux *mux;
+ struct kcm_psock *psock;
+ struct net *net = seq->private;
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+ memset(&mux_stats, 0, sizeof(mux_stats));
+ memset(&psock_stats, 0, sizeof(psock_stats));
+
+ mutex_lock(&knet->mutex);
+
+ aggregate_mux_stats(&knet->aggregate_mux_stats, &mux_stats);
+ aggregate_psock_stats(&knet->aggregate_psock_stats,
+ &psock_stats);
+
+ list_for_each_entry_rcu(mux, &knet->mux_list, kcm_mux_list) {
+ spin_lock_bh(&mux->lock);
+ aggregate_mux_stats(&mux->stats, &mux_stats);
+ aggregate_psock_stats(&mux->aggregate_psock_stats,
+ &psock_stats);
+ list_for_each_entry(psock, &mux->psocks, psock_list)
+ aggregate_psock_stats(&psock->stats, &psock_stats);
+ spin_unlock_bh(&mux->lock);
+ }
+
+ mutex_unlock(&knet->mutex);
+
+ seq_printf(seq,
+ "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s\n",
+ "MUX",
+ "RX-Msgs",
+ "RX-Bytes",
+ "TX-Msgs",
+ "TX-Bytes",
+ "TX-Retries",
+ "Attach",
+ "Unattach",
+ "UnattchRsvd",
+ "RX-RdyDrops");
+
+ seq_printf(seq,
+ "%-8s %-10llu %-16llu %-10llu %-16llu %-10u %-10u %-10u %-10u %-10u\n",
+ "",
+ mux_stats.rx_msgs,
+ mux_stats.rx_bytes,
+ mux_stats.tx_msgs,
+ mux_stats.tx_bytes,
+ mux_stats.tx_retries,
+ mux_stats.psock_attach,
+ mux_stats.psock_unattach_rsvd,
+ mux_stats.psock_unattach,
+ mux_stats.rx_ready_drops);
+
+ seq_printf(seq,
+ "%-8s %-10s %-16s %-10s %-16s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s %-10s\n",
+ "Psock",
+ "RX-Msgs",
+ "RX-Bytes",
+ "TX-Msgs",
+ "TX-Bytes",
+ "Reserved",
+ "Unreserved",
+ "RX-Aborts",
+ "RX-MemFail",
+ "RX-NeedMor",
+ "RX-BadLen",
+ "RX-TooBig",
+ "RX-Timeout",
+ "TX-Aborts");
+
+ seq_printf(seq,
+ "%-8s %-10llu %-16llu %-10llu %-16llu %-10llu %-10llu %-10u %-10u %-10u %-10u %-10u %-10u %-10u\n",
+ "",
+ psock_stats.rx_msgs,
+ psock_stats.rx_bytes,
+ psock_stats.tx_msgs,
+ psock_stats.tx_bytes,
+ psock_stats.reserved,
+ psock_stats.unreserved,
+ psock_stats.rx_aborts,
+ psock_stats.rx_mem_fail,
+ psock_stats.rx_need_more_hdr,
+ psock_stats.rx_bad_hdr_len,
+ psock_stats.rx_msg_too_big,
+ psock_stats.rx_msg_timeouts,
+ psock_stats.tx_aborts);
+
+ return 0;
+}
+
+static int kcm_stats_seq_open(struct inode *inode, struct file *file)
+{
+ return single_open_net(inode, file, kcm_stats_seq_show);
+}
+
+static const struct file_operations kcm_stats_seq_fops = {
+ .owner = THIS_MODULE,
+ .open = kcm_stats_seq_open,
+ .read = seq_read,
+ .llseek = seq_lseek,
+ .release = single_release_net,
+};
+
+static int kcm_proc_init_net(struct net *net)
+{
+ int err;
+
+ if (!proc_create("kcm_stats", S_IRUGO, net->proc_net,
+ &kcm_stats_seq_fops)) {
+ err = -ENOMEM;
+ goto out_kcm_stats;
+ }
+
+ err = kcm_proc_register(net, &kcm_seq_muxinfo);
+ if (err)
+ goto out_kcm;
+
+ return 0;
+
+out_kcm:
+ remove_proc_entry("kcm_stats", net->proc_net);
+out_kcm_stats:
+ return err;
+}
+
+static void kcm_proc_exit_net(struct net *net)
+{
+ kcm_proc_unregister(net, &kcm_seq_muxinfo);
+ remove_proc_entry("kcm_stats", net->proc_net);
+}
+
+static struct pernet_operations kcm_net_ops = {
+ .init = kcm_proc_init_net,
+ .exit = kcm_proc_exit_net,
+};
+
+int __init kcm_proc_init(void)
+{
+ return register_pernet_subsys(&kcm_net_ops);
+}
+
+void __exit kcm_proc_exit(void)
+{
+ unregister_pernet_subsys(&kcm_net_ops);
+}
+
+#endif /* CONFIG_PROC_FS */
diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
new file mode 100644
index 000000000000..40662d73204f
--- /dev/null
+++ b/net/kcm/kcmsock.c
@@ -0,0 +1,2409 @@
+#include <linux/bpf.h>
+#include <linux/errno.h>
+#include <linux/errqueue.h>
+#include <linux/file.h>
+#include <linux/in.h>
+#include <linux/kernel.h>
+#include <linux/module.h>
+#include <linux/net.h>
+#include <linux/netdevice.h>
+#include <linux/poll.h>
+#include <linux/rculist.h>
+#include <linux/skbuff.h>
+#include <linux/socket.h>
+#include <linux/uaccess.h>
+#include <linux/workqueue.h>
+#include <net/kcm.h>
+#include <net/netns/generic.h>
+#include <net/sock.h>
+#include <net/tcp.h>
+#include <uapi/linux/kcm.h>
+
+unsigned int kcm_net_id;
+
+static struct kmem_cache *kcm_psockp __read_mostly;
+static struct kmem_cache *kcm_muxp __read_mostly;
+static struct workqueue_struct *kcm_wq;
+
+static inline struct kcm_sock *kcm_sk(const struct sock *sk)
+{
+ return (struct kcm_sock *)sk;
+}
+
+static inline struct kcm_tx_msg *kcm_tx_msg(struct sk_buff *skb)
+{
+ return (struct kcm_tx_msg *)skb->cb;
+}
+
+static inline struct kcm_rx_msg *kcm_rx_msg(struct sk_buff *skb)
+{
+ return (struct kcm_rx_msg *)((void *)skb->cb +
+ offsetof(struct qdisc_skb_cb, data));
+}
+
+static void report_csk_error(struct sock *csk, int err)
+{
+ csk->sk_err = EPIPE;
+ csk->sk_error_report(csk);
+}
+
+/* Callback lock held */
+static void kcm_abort_rx_psock(struct kcm_psock *psock, int err,
+ struct sk_buff *skb)
+{
+ struct sock *csk = psock->sk;
+
+ /* Unrecoverable error in receive */
+
+ del_timer(&psock->rx_msg_timer);
+
+ if (psock->rx_stopped)
+ return;
+
+ psock->rx_stopped = 1;
+ KCM_STATS_INCR(psock->stats.rx_aborts);
+
+ /* Report an error on the lower socket */
+ report_csk_error(csk, err);
+}
+
+static void kcm_abort_tx_psock(struct kcm_psock *psock, int err,
+ bool wakeup_kcm)
+{
+ struct sock *csk = psock->sk;
+ struct kcm_mux *mux = psock->mux;
+
+ /* Unrecoverable error in transmit */
+
+ spin_lock_bh(&mux->lock);
+
+ if (psock->tx_stopped) {
+ spin_unlock_bh(&mux->lock);
+ return;
+ }
+
+ psock->tx_stopped = 1;
+ KCM_STATS_INCR(psock->stats.tx_aborts);
+
+ if (!psock->tx_kcm) {
+ /* Take off psocks_avail list */
+ list_del(&psock->psock_avail_list);
+ } else if (wakeup_kcm) {
+ /* In this case psock is being aborted while outside of
+ * write_msgs and psock is reserved. Schedule tx_work
+ * to handle the failure there. Need to commit tx_stopped
+ * before queuing work.
+ */
+ smp_mb();
+
+ queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+ }
+
+ spin_unlock_bh(&mux->lock);
+
+ /* Report error on lower socket */
+ report_csk_error(csk, err);
+}
+
+/* RX mux lock held. */
+static void kcm_update_rx_mux_stats(struct kcm_mux *mux,
+ struct kcm_psock *psock)
+{
+ KCM_STATS_ADD(mux->stats.rx_bytes,
+ psock->stats.rx_bytes - psock->saved_rx_bytes);
+ mux->stats.rx_msgs +=
+ psock->stats.rx_msgs - psock->saved_rx_msgs;
+ psock->saved_rx_msgs = psock->stats.rx_msgs;
+ psock->saved_rx_bytes = psock->stats.rx_bytes;
+}
+
+static void kcm_update_tx_mux_stats(struct kcm_mux *mux,
+ struct kcm_psock *psock)
+{
+ KCM_STATS_ADD(mux->stats.tx_bytes,
+ psock->stats.tx_bytes - psock->saved_tx_bytes);
+ mux->stats.tx_msgs +=
+ psock->stats.tx_msgs - psock->saved_tx_msgs;
+ psock->saved_tx_msgs = psock->stats.tx_msgs;
+ psock->saved_tx_bytes = psock->stats.tx_bytes;
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
+
+/* KCM is ready to receive messages on its queue-- either the KCM is new or
+ * has become unblocked after being blocked on full socket buffer. Queue any
+ * pending ready messages on a psock. RX mux lock held.
+ */
+static void kcm_rcv_ready(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+ struct kcm_psock *psock;
+ struct sk_buff *skb;
+
+ if (unlikely(kcm->rx_wait || kcm->rx_psock || kcm->rx_disabled))
+ return;
+
+ while (unlikely((skb = __skb_dequeue(&mux->rx_hold_queue)))) {
+ if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+ /* Assuming buffer limit has been reached */
+ skb_queue_head(&mux->rx_hold_queue, skb);
+ WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+ return;
+ }
+ }
+
+ while (!list_empty(&mux->psocks_ready)) {
+ psock = list_first_entry(&mux->psocks_ready, struct kcm_psock,
+ psock_ready_list);
+
+ if (kcm_queue_rcv_skb(&kcm->sk, psock->ready_rx_msg)) {
+ /* Assuming buffer limit has been reached */
+ WARN_ON(!sk_rmem_alloc_get(&kcm->sk));
+ return;
+ }
+
+ /* Consumed the ready message on the psock. Schedule rx_work to
+ * get more messages.
+ */
+ list_del(&psock->psock_ready_list);
+ psock->ready_rx_msg = NULL;
+
+ /* Commit clearing of ready_rx_msg for queuing work */
+ smp_mb();
+
+ queue_work(kcm_wq, &psock->rx_work);
+ }
+
+ /* Buffer limit is okay now, add to ready list */
+ list_add_tail(&kcm->wait_rx_list,
+ &kcm->mux->kcm_rx_waiters);
+ kcm->rx_wait = true;
+}
+
+static void kcm_rfree(struct sk_buff *skb)
+{
+ struct sock *sk = skb->sk;
+ struct kcm_sock *kcm = kcm_sk(sk);
+ struct kcm_mux *mux = kcm->mux;
+ unsigned int len = skb->truesize;
+
+ sk_mem_uncharge(sk, len);
+ atomic_sub(len, &sk->sk_rmem_alloc);
+
+ /* For reading rx_wait and rx_psock without holding lock */
+ smp_mb__after_atomic();
+
+ if (!kcm->rx_wait && !kcm->rx_psock &&
+ sk_rmem_alloc_get(sk) < sk->sk_rcvlowat) {
+ spin_lock_bh(&mux->rx_lock);
+ kcm_rcv_ready(kcm);
+ spin_unlock_bh(&mux->rx_lock);
+ }
+}
+
+static int kcm_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
+{
+ struct sk_buff_head *list = &sk->sk_receive_queue;
+
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ return -ENOMEM;
+
+ if (!sk_rmem_schedule(sk, skb, skb->truesize))
+ return -ENOBUFS;
+
+ skb->dev = NULL;
+
+ skb_orphan(skb);
+ skb->sk = sk;
+ skb->destructor = kcm_rfree;
+ atomic_add(skb->truesize, &sk->sk_rmem_alloc);
+ sk_mem_charge(sk, skb->truesize);
+
+ skb_queue_tail(list, skb);
+
+ if (!sock_flag(sk, SOCK_DEAD))
+ sk->sk_data_ready(sk);
+
+ return 0;
+}
+
+/* Requeue received messages for a kcm socket to other kcm sockets. This is
+ * called with a kcm socket is receive disabled.
+ * RX mux lock held.
+ */
+static void requeue_rx_msgs(struct kcm_mux *mux, struct sk_buff_head *head)
+{
+ struct sk_buff *skb;
+ struct kcm_sock *kcm;
+
+ while ((skb = __skb_dequeue(head))) {
+ /* Reset destructor to avoid calling kcm_rcv_ready */
+ skb->destructor = sock_rfree;
+ skb_orphan(skb);
+try_again:
+ if (list_empty(&mux->kcm_rx_waiters)) {
+ skb_queue_tail(&mux->rx_hold_queue, skb);
+ continue;
+ }
+
+ kcm = list_first_entry(&mux->kcm_rx_waiters,
+ struct kcm_sock, wait_rx_list);
+
+ if (kcm_queue_rcv_skb(&kcm->sk, skb)) {
+ /* Should mean socket buffer full */
+ list_del(&kcm->wait_rx_list);
+ kcm->rx_wait = false;
+
+ /* Commit rx_wait to read in kcm_free */
+ smp_wmb();
+
+ goto try_again;
+ }
+ }
+}
+
+/* Lower sock lock held */
+static struct kcm_sock *reserve_rx_kcm(struct kcm_psock *psock,
+ struct sk_buff *head)
+{
+ struct kcm_mux *mux = psock->mux;
+ struct kcm_sock *kcm;
+
+ WARN_ON(psock->ready_rx_msg);
+
+ if (psock->rx_kcm)
+ return psock->rx_kcm;
+
+ spin_lock_bh(&mux->rx_lock);
+
+ if (psock->rx_kcm) {
+ spin_unlock_bh(&mux->rx_lock);
+ return psock->rx_kcm;
+ }
+
+ kcm_update_rx_mux_stats(mux, psock);
+
+ if (list_empty(&mux->kcm_rx_waiters)) {
+ psock->ready_rx_msg = head;
+ list_add_tail(&psock->psock_ready_list,
+ &mux->psocks_ready);
+ spin_unlock_bh(&mux->rx_lock);
+ return NULL;
+ }
+
+ kcm = list_first_entry(&mux->kcm_rx_waiters,
+ struct kcm_sock, wait_rx_list);
+ list_del(&kcm->wait_rx_list);
+ kcm->rx_wait = false;
+
+ psock->rx_kcm = kcm;
+ kcm->rx_psock = psock;
+
+ spin_unlock_bh(&mux->rx_lock);
+
+ return kcm;
+}
+
+static void kcm_done(struct kcm_sock *kcm);
+
+static void kcm_done_work(struct work_struct *w)
+{
+ kcm_done(container_of(w, struct kcm_sock, done_work));
+}
+
+/* Lower sock held */
+static void unreserve_rx_kcm(struct kcm_psock *psock,
+ bool rcv_ready)
+{
+ struct kcm_sock *kcm = psock->rx_kcm;
+ struct kcm_mux *mux = psock->mux;
+
+ if (!kcm)
+ return;
+
+ spin_lock_bh(&mux->rx_lock);
+
+ psock->rx_kcm = NULL;
+ kcm->rx_psock = NULL;
+
+ /* Commit kcm->rx_psock before sk_rmem_alloc_get to sync with
+ * kcm_rfree
+ */
+ smp_mb();
+
+ if (unlikely(kcm->done)) {
+ spin_unlock_bh(&mux->rx_lock);
+
+ /* Need to run kcm_done in a task since we need to qcquire
+ * callback locks which may already be held here.
+ */
+ INIT_WORK(&kcm->done_work, kcm_done_work);
+ schedule_work(&kcm->done_work);
+ return;
+ }
+
+ if (unlikely(kcm->rx_disabled)) {
+ requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+ } else if (rcv_ready || unlikely(!sk_rmem_alloc_get(&kcm->sk))) {
+ /* Check for degenerative race with rx_wait that all
+ * data was dequeued (accounted for in kcm_rfree).
+ */
+ kcm_rcv_ready(kcm);
+ }
+ spin_unlock_bh(&mux->rx_lock);
+}
+
+static void kcm_start_rx_timer(struct kcm_psock *psock)
+{
+ if (psock->sk->sk_rcvtimeo)
+ mod_timer(&psock->rx_msg_timer, psock->sk->sk_rcvtimeo);
+}
+
+/* Macro to invoke filter function. */
+#define KCM_RUN_FILTER(prog, ctx) \
+ (*prog->bpf_func)(ctx, prog->insnsi)
+
+/* Lower socket lock held */
+static int kcm_tcp_recv(read_descriptor_t *desc, struct sk_buff *orig_skb,
+ unsigned int orig_offset, size_t orig_len)
+{
+ struct kcm_psock *psock = (struct kcm_psock *)desc->arg.data;
+ struct kcm_rx_msg *rxm;
+ struct kcm_sock *kcm;
+ struct sk_buff *head, *skb;
+ size_t eaten = 0, cand_len;
+ ssize_t extra;
+ int err;
+ bool cloned_orig = false;
+
+ if (psock->ready_rx_msg)
+ return 0;
+
+ head = psock->rx_skb_head;
+ if (head) {
+ /* Message already in progress */
+
+ rxm = kcm_rx_msg(head);
+ if (unlikely(rxm->early_eaten)) {
+ /* Already some number of bytes on the receive sock
+ * data saved in rx_skb_head, just indicate they
+ * are consumed.
+ */
+ eaten = orig_len <= rxm->early_eaten ?
+ orig_len : rxm->early_eaten;
+ rxm->early_eaten -= eaten;
+
+ return eaten;
+ }
+
+ if (unlikely(orig_offset)) {
+ /* Getting data with a non-zero offset when a message is
+ * in progress is not expected. If it does happen, we
+ * need to clone and pull since we can't deal with
+ * offsets in the skbs for a message expect in the head.
+ */
+ orig_skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!orig_skb) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ if (!pskb_pull(orig_skb, orig_offset)) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ kfree_skb(orig_skb);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ cloned_orig = true;
+ orig_offset = 0;
+ }
+
+ if (!psock->rx_skb_nextp) {
+ /* We are going to append to the frags_list of head.
+ * Need to unshare the frag_list.
+ */
+ err = skb_unclone(head, GFP_ATOMIC);
+ if (err) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = err;
+ return 0;
+ }
+
+ if (unlikely(skb_shinfo(head)->frag_list)) {
+ /* We can't append to an sk_buff that already
+ * has a frag_list. We create a new head, point
+ * the frag_list of that to the old head, and
+ * then are able to use the old head->next for
+ * appending to the message.
+ */
+ if (WARN_ON(head->next)) {
+ desc->error = -EINVAL;
+ return 0;
+ }
+
+ skb = alloc_skb(0, GFP_ATOMIC);
+ if (!skb) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ return 0;
+ }
+ skb->len = head->len;
+ skb->data_len = head->len;
+ skb->truesize = head->truesize;
+ *kcm_rx_msg(skb) = *kcm_rx_msg(head);
+ psock->rx_skb_nextp = &head->next;
+ skb_shinfo(skb)->frag_list = head;
+ psock->rx_skb_head = skb;
+ head = skb;
+ } else {
+ psock->rx_skb_nextp =
+ &skb_shinfo(head)->frag_list;
+ }
+ }
+ }
+
+ while (eaten < orig_len) {
+ /* Always clone since we will consume something */
+ skb = skb_clone(orig_skb, GFP_ATOMIC);
+ if (!skb) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = -ENOMEM;
+ break;
+ }
+
+ cand_len = orig_len - eaten;
+
+ head = psock->rx_skb_head;
+ if (!head) {
+ head = skb;
+ psock->rx_skb_head = head;
+ /* Will set rx_skb_nextp on next packet if needed */
+ psock->rx_skb_nextp = NULL;
+ rxm = kcm_rx_msg(head);
+ memset(rxm, 0, sizeof(*rxm));
+ rxm->offset = orig_offset + eaten;
+ } else {
+ /* Unclone since we may be appending to an skb that we
+ * already share a frag_list with.
+ */
+ err = skb_unclone(skb, GFP_ATOMIC);
+ if (err) {
+ KCM_STATS_INCR(psock->stats.rx_mem_fail);
+ desc->error = err;
+ break;
+ }
+
+ rxm = kcm_rx_msg(head);
+ *psock->rx_skb_nextp = skb;
+ psock->rx_skb_nextp = &skb->next;
+ head->data_len += skb->len;
+ head->len += skb->len;
+ head->truesize += skb->truesize;
+ }
+
+ if (!rxm->full_len) {
+ ssize_t len;
+
+ len = KCM_RUN_FILTER(psock->bpf_prog, head);
+
+ if (!len) {
+ /* Need more header to determine length */
+ if (!rxm->accum_len) {
+ /* Start RX timer for new message */
+ kcm_start_rx_timer(psock);
+ }
+ rxm->accum_len += cand_len;
+ eaten += cand_len;
+ KCM_STATS_INCR(psock->stats.rx_need_more_hdr);
+ WARN_ON(eaten != orig_len);
+ break;
+ } else if (len > psock->sk->sk_rcvbuf) {
+ /* Message length exceeds maximum allowed */
+ KCM_STATS_INCR(psock->stats.rx_msg_too_big);
+ desc->error = -EMSGSIZE;
+ psock->rx_skb_head = NULL;
+ kcm_abort_rx_psock(psock, EMSGSIZE, head);
+ break;
+ } else if (len <= (ssize_t)head->len -
+ skb->len - rxm->offset) {
+ /* Length must be into new skb (and also
+ * greater than zero)
+ */
+ KCM_STATS_INCR(psock->stats.rx_bad_hdr_len);
+ desc->error = -EPROTO;
+ psock->rx_skb_head = NULL;
+ kcm_abort_rx_psock(psock, EPROTO, head);
+ break;
+ }
+
+ rxm->full_len = len;
+ }
+
+ extra = (ssize_t)(rxm->accum_len + cand_len) - rxm->full_len;
+
+ if (extra < 0) {
+ /* Message not complete yet. */
+ if (rxm->full_len - rxm->accum_len >
+ tcp_inq(psock->sk)) {
+ /* Don't have the whole messages in the socket
+ * buffer. Set psock->rx_need_bytes to wait for
+ * the rest of the message. Also, set "early
+ * eaten" since we've already buffered the skb
+ * but don't consume yet per tcp_read_sock.
+ */
+
+ if (!rxm->accum_len) {
+ /* Start RX timer for new message */
+ kcm_start_rx_timer(psock);
+ }
+
+ psock->rx_need_bytes = rxm->full_len -
+ rxm->accum_len;
+ rxm->accum_len += cand_len;
+ rxm->early_eaten = cand_len;
+ KCM_STATS_ADD(psock->stats.rx_bytes, cand_len);
+ desc->count = 0; /* Stop reading socket */
+ break;
+ }
+ rxm->accum_len += cand_len;
+ eaten += cand_len;
+ WARN_ON(eaten != orig_len);
+ break;
+ }
+
+ /* Positive extra indicates ore bytes than needed for the
+ * message
+ */
+
+ WARN_ON(extra > cand_len);
+
+ eaten += (cand_len - extra);
+
+ /* Hurray, we have a new message! */
+ del_timer(&psock->rx_msg_timer);
+ psock->rx_skb_head = NULL;
+ KCM_STATS_INCR(psock->stats.rx_msgs);
+
+try_queue:
+ kcm = reserve_rx_kcm(psock, head);
+ if (!kcm) {
+ /* Unable to reserve a KCM, message is held in psock. */
+ break;
+ }
+
+ if (kcm_queue_rcv_skb(&kcm->sk, head)) {
+ /* Should mean socket buffer full */
+ unreserve_rx_kcm(psock, false);
+ goto try_queue;
+ }
+ }
+
+ if (cloned_orig)
+ kfree_skb(orig_skb);
+
+ KCM_STATS_ADD(psock->stats.rx_bytes, eaten);
+
+ return eaten;
+}
+
+/* Called with lock held on lower socket */
+static int psock_tcp_read_sock(struct kcm_psock *psock)
+{
+ read_descriptor_t desc;
+
+ desc.arg.data = psock;
+ desc.error = 0;
+ desc.count = 1; /* give more than one skb per call */
+
+ /* sk should be locked here, so okay to do tcp_read_sock */
+ tcp_read_sock(psock->sk, &desc, kcm_tcp_recv);
+
+ unreserve_rx_kcm(psock, true);
+
+ return desc.error;
+}
+
+/* Lower sock lock held */
+static void psock_tcp_data_ready(struct sock *sk)
+{
+ struct kcm_psock *psock;
+
+ read_lock_bh(&sk->sk_callback_lock);
+
+ psock = (struct kcm_psock *)sk->sk_user_data;
+ if (unlikely(!psock || psock->rx_stopped))
+ goto out;
+
+ if (psock->ready_rx_msg)
+ goto out;
+
+ if (psock->rx_need_bytes) {
+ if (tcp_inq(sk) >= psock->rx_need_bytes)
+ psock->rx_need_bytes = 0;
+ else
+ goto out;
+ }
+
+ if (psock_tcp_read_sock(psock) == -ENOMEM)
+ queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void do_psock_rx_work(struct kcm_psock *psock)
+{
+ read_descriptor_t rd_desc;
+ struct sock *csk = psock->sk;
+
+ /* We need the read lock to synchronize with psock_tcp_data_ready. We
+ * need the socket lock for calling tcp_read_sock.
+ */
+ lock_sock(csk);
+ read_lock_bh(&csk->sk_callback_lock);
+
+ if (unlikely(csk->sk_user_data != psock))
+ goto out;
+
+ if (unlikely(psock->rx_stopped))
+ goto out;
+
+ if (psock->ready_rx_msg)
+ goto out;
+
+ rd_desc.arg.data = psock;
+
+ if (psock_tcp_read_sock(psock) == -ENOMEM)
+ queue_delayed_work(kcm_wq, &psock->rx_delayed_work, 0);
+
+out:
+ read_unlock_bh(&csk->sk_callback_lock);
+ release_sock(csk);
+}
+
+static void psock_rx_work(struct work_struct *w)
+{
+ do_psock_rx_work(container_of(w, struct kcm_psock, rx_work));
+}
+
+static void psock_rx_delayed_work(struct work_struct *w)
+{
+ do_psock_rx_work(container_of(w, struct kcm_psock,
+ rx_delayed_work.work));
+}
+
+static void psock_tcp_state_change(struct sock *sk)
+{
+ /* TCP only does a POLLIN for a half close. Do a POLLHUP here
+ * since application will normally not poll with POLLIN
+ * on the TCP sockets.
+ */
+
+ report_csk_error(sk, EPIPE);
+}
+
+static void psock_tcp_write_space(struct sock *sk)
+{
+ struct kcm_psock *psock;
+ struct kcm_mux *mux;
+ struct kcm_sock *kcm;
+
+ read_lock_bh(&sk->sk_callback_lock);
+
+ psock = (struct kcm_psock *)sk->sk_user_data;
+ if (unlikely(!psock))
+ goto out;
+
+ mux = psock->mux;
+
+ spin_lock_bh(&mux->lock);
+
+ /* Check if the socket is reserved so someone is waiting for sending. */
+ kcm = psock->tx_kcm;
+ if (kcm)
+ queue_work(kcm_wq, &kcm->tx_work);
+
+ spin_unlock_bh(&mux->lock);
+out:
+ read_unlock_bh(&sk->sk_callback_lock);
+}
+
+static void unreserve_psock(struct kcm_sock *kcm);
+
+/* kcm sock is locked. */
+static struct kcm_psock *reserve_psock(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+ struct kcm_psock *psock;
+
+ psock = kcm->tx_psock;
+
+ smp_rmb(); /* Must read tx_psock before tx_wait */
+
+ if (psock) {
+ WARN_ON(kcm->tx_wait);
+ if (unlikely(psock->tx_stopped))
+ unreserve_psock(kcm);
+ else
+ return kcm->tx_psock;
+ }
+
+ spin_lock_bh(&mux->lock);
+
+ /* Check again under lock to see if psock was reserved for this
+ * psock via psock_unreserve.
+ */
+ psock = kcm->tx_psock;
+ if (unlikely(psock)) {
+ WARN_ON(kcm->tx_wait);
+ spin_unlock_bh(&mux->lock);
+ return kcm->tx_psock;
+ }
+
+ if (!list_empty(&mux->psocks_avail)) {
+ psock = list_first_entry(&mux->psocks_avail,
+ struct kcm_psock,
+ psock_avail_list);
+ list_del(&psock->psock_avail_list);
+ if (kcm->tx_wait) {
+ list_del(&kcm->wait_psock_list);
+ kcm->tx_wait = false;
+ }
+ kcm->tx_psock = psock;
+ psock->tx_kcm = kcm;
+ KCM_STATS_INCR(psock->stats.reserved);
+ } else if (!kcm->tx_wait) {
+ list_add_tail(&kcm->wait_psock_list,
+ &mux->kcm_tx_waiters);
+ kcm->tx_wait = true;
+ }
+
+ spin_unlock_bh(&mux->lock);
+
+ return psock;
+}
+
+/* mux lock held */
+static void psock_now_avail(struct kcm_psock *psock)
+{
+ struct kcm_mux *mux = psock->mux;
+ struct kcm_sock *kcm;
+
+ if (list_empty(&mux->kcm_tx_waiters)) {
+ list_add_tail(&psock->psock_avail_list,
+ &mux->psocks_avail);
+ } else {
+ kcm = list_first_entry(&mux->kcm_tx_waiters,
+ struct kcm_sock,
+ wait_psock_list);
+ list_del(&kcm->wait_psock_list);
+ kcm->tx_wait = false;
+ psock->tx_kcm = kcm;
+
+ /* Commit before changing tx_psock since that is read in
+ * reserve_psock before queuing work.
+ */
+ smp_mb();
+
+ kcm->tx_psock = psock;
+ KCM_STATS_INCR(psock->stats.reserved);
+ queue_work(kcm_wq, &kcm->tx_work);
+ }
+}
+
+/* kcm sock is locked. */
+static void unreserve_psock(struct kcm_sock *kcm)
+{
+ struct kcm_psock *psock;
+ struct kcm_mux *mux = kcm->mux;
+
+ spin_lock_bh(&mux->lock);
+
+ psock = kcm->tx_psock;
+
+ if (WARN_ON(!psock)) {
+ spin_unlock_bh(&mux->lock);
+ return;
+ }
+
+ smp_rmb(); /* Read tx_psock before tx_wait */
+
+ kcm_update_tx_mux_stats(mux, psock);
+
+ WARN_ON(kcm->tx_wait);
+
+ kcm->tx_psock = NULL;
+ psock->tx_kcm = NULL;
+ KCM_STATS_INCR(psock->stats.unreserved);
+
+ if (unlikely(psock->tx_stopped)) {
+ if (psock->done) {
+ /* Deferred free */
+ list_del(&psock->psock_list);
+ mux->psocks_cnt--;
+ sock_put(psock->sk);
+ fput(psock->sk->sk_socket->file);
+ kmem_cache_free(kcm_psockp, psock);
+ }
+
+ /* Don't put back on available list */
+
+ spin_unlock_bh(&mux->lock);
+
+ return;
+ }
+
+ psock_now_avail(psock);
+
+ spin_unlock_bh(&mux->lock);
+}
+
+static void kcm_report_tx_retry(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+
+ spin_lock_bh(&mux->lock);
+ KCM_STATS_INCR(mux->stats.tx_retries);
+ spin_unlock_bh(&mux->lock);
+}
+
+/* Write any messages ready on the kcm socket. Called with kcm sock lock
+ * held. Return bytes actually sent or error.
+ */
+static int kcm_write_msgs(struct kcm_sock *kcm)
+{
+ struct sock *sk = &kcm->sk;
+ struct kcm_psock *psock;
+ struct sk_buff *skb, *head;
+ struct kcm_tx_msg *txm;
+ unsigned short fragidx, frag_offset;
+ unsigned int sent, total_sent = 0;
+ int ret = 0;
+
+ kcm->tx_wait_more = false;
+ psock = kcm->tx_psock;
+ if (unlikely(psock && psock->tx_stopped)) {
+ /* A reserved psock was aborted asynchronously. Unreserve
+ * it and we'll retry the message.
+ */
+ unreserve_psock(kcm);
+ kcm_report_tx_retry(kcm);
+ if (skb_queue_empty(&sk->sk_write_queue))
+ return 0;
+
+ kcm_tx_msg(skb_peek(&sk->sk_write_queue))->sent = 0;
+
+ } else if (skb_queue_empty(&sk->sk_write_queue)) {
+ return 0;
+ }
+
+ head = skb_peek(&sk->sk_write_queue);
+ txm = kcm_tx_msg(head);
+
+ if (txm->sent) {
+ /* Send of first skbuff in queue already in progress */
+ if (WARN_ON(!psock)) {
+ ret = -EINVAL;
+ goto out;
+ }
+ sent = txm->sent;
+ frag_offset = txm->frag_offset;
+ fragidx = txm->fragidx;
+ skb = txm->frag_skb;
+
+ goto do_frag;
+ }
+
+try_again:
+ psock = reserve_psock(kcm);
+ if (!psock)
+ goto out;
+
+ do {
+ skb = head;
+ txm = kcm_tx_msg(head);
+ sent = 0;
+
+do_frag_list:
+ if (WARN_ON(!skb_shinfo(skb)->nr_frags)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ for (fragidx = 0; fragidx < skb_shinfo(skb)->nr_frags;
+ fragidx++) {
+ skb_frag_t *frag;
+
+ frag_offset = 0;
+do_frag:
+ frag = &skb_shinfo(skb)->frags[fragidx];
+ if (WARN_ON(!frag->size)) {
+ ret = -EINVAL;
+ goto out;
+ }
+
+ ret = kernel_sendpage(psock->sk->sk_socket,
+ frag->page.p,
+ frag->page_offset + frag_offset,
+ frag->size - frag_offset,
+ MSG_DONTWAIT);
+ if (ret <= 0) {
+ if (ret == -EAGAIN) {
+ /* Save state to try again when there's
+ * write space on the socket
+ */
+ txm->sent = sent;
+ txm->frag_offset = frag_offset;
+ txm->fragidx = fragidx;
+ txm->frag_skb = skb;
+
+ ret = 0;
+ goto out;
+ }
+
+ /* Hard failure in sending message, abort this
+ * psock since it has lost framing
+ * synchonization and retry sending the
+ * message from the beginning.
+ */
+ kcm_abort_tx_psock(psock, ret ? -ret : EPIPE,
+ true);
+ unreserve_psock(kcm);
+
+ txm->sent = 0;
+ kcm_report_tx_retry(kcm);
+ ret = 0;
+
+ goto try_again;
+ }
+
+ sent += ret;
+ frag_offset += ret;
+ KCM_STATS_ADD(psock->stats.tx_bytes, ret);
+ if (frag_offset < frag->size) {
+ /* Not finished with this frag */
+ goto do_frag;
+ }
+ }
+
+ if (skb == head) {
+ if (skb_has_frag_list(skb)) {
+ skb = skb_shinfo(skb)->frag_list;
+ goto do_frag_list;
+ }
+ } else if (skb->next) {
+ skb = skb->next;
+ goto do_frag_list;
+ }
+
+ /* Successfully sent the whole packet, account for it. */
+ skb_dequeue(&sk->sk_write_queue);
+ kfree_skb(head);
+ sk->sk_wmem_queued -= sent;
+ total_sent += sent;
+ KCM_STATS_INCR(psock->stats.tx_msgs);
+ } while ((head = skb_peek(&sk->sk_write_queue)));
+out:
+ if (!head) {
+ /* Done with all queued messages. */
+ WARN_ON(!skb_queue_empty(&sk->sk_write_queue));
+ unreserve_psock(kcm);
+ }
+
+ /* Check if write space is available */
+ sk->sk_write_space(sk);
+
+ return total_sent ? : ret;
+}
+
+static void kcm_tx_work(struct work_struct *w)
+{
+ struct kcm_sock *kcm = container_of(w, struct kcm_sock, tx_work);
+ struct sock *sk = &kcm->sk;
+ int err;
+
+ lock_sock(sk);
+
+ /* Primarily for SOCK_DGRAM sockets, also handle asynchronous tx
+ * aborts
+ */
+ err = kcm_write_msgs(kcm);
+ if (err < 0) {
+ /* Hard failure in write, report error on KCM socket */
+ pr_warn("KCM: Hard failure on kcm_write_msgs %d\n", err);
+ report_csk_error(&kcm->sk, -err);
+ goto out;
+ }
+
+ /* Primarily for SOCK_SEQPACKET sockets */
+ if (likely(sk->sk_socket) &&
+ test_bit(SOCK_NOSPACE, &sk->sk_socket->flags)) {
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ sk->sk_write_space(sk);
+ }
+
+out:
+ release_sock(sk);
+}
+
+static void kcm_push(struct kcm_sock *kcm)
+{
+ if (kcm->tx_wait_more)
+ kcm_write_msgs(kcm);
+}
+
+static ssize_t kcm_sendpage(struct socket *sock, struct page *page,
+ int offset, size_t size, int flags)
+
+{
+ struct sock *sk = sock->sk;
+ struct kcm_sock *kcm = kcm_sk(sk);
+ struct sk_buff *skb = NULL, *head = NULL;
+ long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
+ bool eor;
+ int err = 0;
+ int i;
+
+ if (flags & MSG_SENDPAGE_NOTLAST)
+ flags |= MSG_MORE;
+
+ /* No MSG_EOR from splice, only look at MSG_MORE */
+ eor = !(flags & MSG_MORE);
+
+ lock_sock(sk);
+
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ err = -EPIPE;
+ if (sk->sk_err)
+ goto out_error;
+
+ if (kcm->seq_skb) {
+ /* Previously opened message */
+ head = kcm->seq_skb;
+ skb = kcm_tx_msg(head)->last_skb;
+ i = skb_shinfo(skb)->nr_frags;
+
+ if (skb_can_coalesce(skb, i, page, offset)) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], size);
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+ goto coalesced;
+ }
+
+ if (i >= MAX_SKB_FRAGS) {
+ struct sk_buff *tskb;
+
+ tskb = alloc_skb(0, sk->sk_allocation);
+ while (!tskb) {
+ kcm_push(kcm);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+ }
+
+ if (head == skb)
+ skb_shinfo(head)->frag_list = tskb;
+ else
+ skb->next = tskb;
+
+ skb = tskb;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ i = 0;
+ }
+ } else {
+ /* Call the sk_stream functions to manage the sndbuf mem. */
+ if (!sk_stream_memory_free(sk)) {
+ kcm_push(kcm);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+ }
+
+ head = alloc_skb(0, sk->sk_allocation);
+ while (!head) {
+ kcm_push(kcm);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+ }
+
+ skb = head;
+ i = 0;
+ }
+
+ get_page(page);
+ skb_fill_page_desc(skb, i, page, offset, size);
+ skb_shinfo(skb)->tx_flags |= SKBTX_SHARED_FRAG;
+
+coalesced:
+ skb->len += size;
+ skb->data_len += size;
+ skb->truesize += size;
+ sk->sk_wmem_queued += size;
+ sk_mem_charge(sk, size);
+
+ if (head != skb) {
+ head->len += size;
+ head->data_len += size;
+ head->truesize += size;
+ }
+
+ if (eor) {
+ bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+ /* Message complete, queue it on send buffer */
+ __skb_queue_tail(&sk->sk_write_queue, head);
+ kcm->seq_skb = NULL;
+ KCM_STATS_INCR(kcm->stats.tx_msgs);
+
+ if (flags & MSG_BATCH) {
+ kcm->tx_wait_more = true;
+ } else if (kcm->tx_wait_more || not_busy) {
+ err = kcm_write_msgs(kcm);
+ if (err < 0) {
+ /* We got a hard error in write_msgs but have
+ * already queued this message. Report an error
+ * in the socket, but don't affect return value
+ * from sendmsg
+ */
+ pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+ report_csk_error(&kcm->sk, -err);
+ }
+ }
+ } else {
+ /* Message not complete, save state */
+ kcm->seq_skb = head;
+ kcm_tx_msg(head)->last_skb = skb;
+ }
+
+ KCM_STATS_ADD(kcm->stats.tx_bytes, size);
+
+ release_sock(sk);
+ return size;
+
+out_error:
+ kcm_push(kcm);
+
+ err = sk_stream_error(sk, flags, err);
+
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
+
+ release_sock(sk);
+ return err;
+}
+
+static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+{
+ struct sock *sk = sock->sk;
+ struct kcm_sock *kcm = kcm_sk(sk);
+ struct sk_buff *skb = NULL, *head = NULL;
+ size_t copy, copied = 0;
+ long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
+ int eor = (sock->type == SOCK_DGRAM) ?
+ !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
+ int err = -EPIPE;
+
+ lock_sock(sk);
+
+ /* Per tcp_sendmsg this should be in poll */
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
+
+ if (sk->sk_err)
+ goto out_error;
+
+ if (kcm->seq_skb) {
+ /* Previously opened message */
+ head = kcm->seq_skb;
+ skb = kcm_tx_msg(head)->last_skb;
+ goto start;
+ }
+
+ /* Call the sk_stream functions to manage the sndbuf mem. */
+ if (!sk_stream_memory_free(sk)) {
+ kcm_push(kcm);
+ set_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+ }
+
+ /* New message, alloc head skb */
+ head = alloc_skb(0, sk->sk_allocation);
+ while (!head) {
+ kcm_push(kcm);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+
+ head = alloc_skb(0, sk->sk_allocation);
+ }
+
+ skb = head;
+
+ /* Set ip_summed to CHECKSUM_UNNECESSARY to avoid calling
+ * csum_and_copy_from_iter from skb_do_copy_data_nocache.
+ */
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+
+start:
+ while (msg_data_left(msg)) {
+ bool merge = true;
+ int i = skb_shinfo(skb)->nr_frags;
+ struct page_frag *pfrag = sk_page_frag(sk);
+
+ if (!sk_page_frag_refill(sk, pfrag))
+ goto wait_for_memory;
+
+ if (!skb_can_coalesce(skb, i, pfrag->page,
+ pfrag->offset)) {
+ if (i == MAX_SKB_FRAGS) {
+ struct sk_buff *tskb;
+
+ tskb = alloc_skb(0, sk->sk_allocation);
+ if (!tskb)
+ goto wait_for_memory;
+
+ if (head == skb)
+ skb_shinfo(head)->frag_list = tskb;
+ else
+ skb->next = tskb;
+
+ skb = tskb;
+ skb->ip_summed = CHECKSUM_UNNECESSARY;
+ continue;
+ }
+ merge = false;
+ }
+
+ copy = min_t(int, msg_data_left(msg),
+ pfrag->size - pfrag->offset);
+
+ if (!sk_wmem_schedule(sk, copy))
+ goto wait_for_memory;
+
+ err = skb_copy_to_page_nocache(sk, &msg->msg_iter, skb,
+ pfrag->page,
+ pfrag->offset,
+ copy);
+ if (err)
+ goto out_error;
+
+ /* Update the skb. */
+ if (merge) {
+ skb_frag_size_add(&skb_shinfo(skb)->frags[i - 1], copy);
+ } else {
+ skb_fill_page_desc(skb, i, pfrag->page,
+ pfrag->offset, copy);
+ get_page(pfrag->page);
+ }
+
+ pfrag->offset += copy;
+ copied += copy;
+ if (head != skb) {
+ head->len += copy;
+ head->data_len += copy;
+ }
+
+ continue;
+
+wait_for_memory:
+ kcm_push(kcm);
+ err = sk_stream_wait_memory(sk, &timeo);
+ if (err)
+ goto out_error;
+ }
+
+ if (eor) {
+ bool not_busy = skb_queue_empty(&sk->sk_write_queue);
+
+ /* Message complete, queue it on send buffer */
+ __skb_queue_tail(&sk->sk_write_queue, head);
+ kcm->seq_skb = NULL;
+ KCM_STATS_INCR(kcm->stats.tx_msgs);
+
+ if (msg->msg_flags & MSG_BATCH) {
+ kcm->tx_wait_more = true;
+ } else if (kcm->tx_wait_more || not_busy) {
+ err = kcm_write_msgs(kcm);
+ if (err < 0) {
+ /* We got a hard error in write_msgs but have
+ * already queued this message. Report an error
+ * in the socket, but don't affect return value
+ * from sendmsg
+ */
+ pr_warn("KCM: Hard failure on kcm_write_msgs\n");
+ report_csk_error(&kcm->sk, -err);
+ }
+ }
+ } else {
+ /* Message not complete, save state */
+partial_message:
+ kcm->seq_skb = head;
+ kcm_tx_msg(head)->last_skb = skb;
+ }
+
+ KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
+
+ release_sock(sk);
+ return copied;
+
+out_error:
+ kcm_push(kcm);
+
+ if (copied && sock->type == SOCK_SEQPACKET) {
+ /* Wrote some bytes before encountering an
+ * error, return partial success.
+ */
+ goto partial_message;
+ }
+
+ if (head != kcm->seq_skb)
+ kfree_skb(head);
+
+ err = sk_stream_error(sk, msg->msg_flags, err);
+
+ /* make sure we wake any epoll edge trigger waiter */
+ if (unlikely(skb_queue_len(&sk->sk_write_queue) == 0 && err == -EAGAIN))
+ sk->sk_write_space(sk);
+
+ release_sock(sk);
+ return err;
+}
+
+static struct sk_buff *kcm_wait_data(struct sock *sk, int flags,
+ long timeo, int *err)
+{
+ struct sk_buff *skb;
+
+ while (!(skb = skb_peek(&sk->sk_receive_queue))) {
+ if (sk->sk_err) {
+ *err = sock_error(sk);
+ return NULL;
+ }
+
+ if (sock_flag(sk, SOCK_DONE))
+ return NULL;
+
+ if ((flags & MSG_DONTWAIT) || !timeo) {
+ *err = -EAGAIN;
+ return NULL;
+ }
+
+ sk_wait_data(sk, &timeo, NULL);
+
+ /* Handle signals */
+ if (signal_pending(current)) {
+ *err = sock_intr_errno(timeo);
+ return NULL;
+ }
+ }
+
+ return skb;
+}
+
+static int kcm_recvmsg(struct socket *sock, struct msghdr *msg,
+ size_t len, int flags)
+{
+ struct sock *sk = sock->sk;
+ struct kcm_sock *kcm = kcm_sk(sk);
+ int err = 0;
+ long timeo;
+ struct kcm_rx_msg *rxm;
+ int copied = 0;
+ struct sk_buff *skb;
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ lock_sock(sk);
+
+ skb = kcm_wait_data(sk, flags, timeo, &err);
+ if (!skb)
+ goto out;
+
+ /* Okay, have a message on the receive queue */
+
+ rxm = kcm_rx_msg(skb);
+
+ if (len > rxm->full_len)
+ len = rxm->full_len;
+
+ err = skb_copy_datagram_msg(skb, rxm->offset, msg, len);
+ if (err < 0)
+ goto out;
+
+ copied = len;
+ if (likely(!(flags & MSG_PEEK))) {
+ KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
+ if (copied < rxm->full_len) {
+ if (sock->type == SOCK_DGRAM) {
+ /* Truncated message */
+ msg->msg_flags |= MSG_TRUNC;
+ goto msg_finished;
+ }
+ rxm->offset += copied;
+ rxm->full_len -= copied;
+ } else {
+msg_finished:
+ /* Finished with message */
+ msg->msg_flags |= MSG_EOR;
+ KCM_STATS_INCR(kcm->stats.rx_msgs);
+ skb_unlink(skb, &sk->sk_receive_queue);
+ kfree_skb(skb);
+ }
+ }
+
+out:
+ release_sock(sk);
+
+ return copied ? : err;
+}
+
+static ssize_t kcm_sock_splice(struct sock *sk,
+ struct pipe_inode_info *pipe,
+ struct splice_pipe_desc *spd)
+{
+ int ret;
+
+ release_sock(sk);
+ ret = splice_to_pipe(pipe, spd);
+ lock_sock(sk);
+
+ return ret;
+}
+
+static ssize_t kcm_splice_read(struct socket *sock, loff_t *ppos,
+ struct pipe_inode_info *pipe, size_t len,
+ unsigned int flags)
+{
+ struct sock *sk = sock->sk;
+ struct kcm_sock *kcm = kcm_sk(sk);
+ long timeo;
+ struct kcm_rx_msg *rxm;
+ int err = 0;
+ size_t copied;
+ struct sk_buff *skb;
+
+ /* Only support splice for SOCKSEQPACKET */
+
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
+
+ lock_sock(sk);
+
+ skb = kcm_wait_data(sk, flags, timeo, &err);
+ if (!skb)
+ goto err_out;
+
+ /* Okay, have a message on the receive queue */
+
+ rxm = kcm_rx_msg(skb);
+
+ if (len > rxm->full_len)
+ len = rxm->full_len;
+
+ copied = skb_splice_bits(skb, sk, rxm->offset, pipe, len, flags,
+ kcm_sock_splice);
+ if (copied < 0) {
+ err = copied;
+ goto err_out;
+ }
+
+ KCM_STATS_ADD(kcm->stats.rx_bytes, copied);
+
+ rxm->offset += copied;
+ rxm->full_len -= copied;
+
+ /* We have no way to return MSG_EOR. If all the bytes have been
+ * read we still leave the message in the receive socket buffer.
+ * A subsequent recvmsg needs to be done to return MSG_EOR and
+ * finish reading the message.
+ */
+
+ release_sock(sk);
+
+ return copied;
+
+err_out:
+ release_sock(sk);
+
+ return err;
+}
+
+/* kcm sock lock held */
+static void kcm_recv_disable(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+
+ if (kcm->rx_disabled)
+ return;
+
+ spin_lock_bh(&mux->rx_lock);
+
+ kcm->rx_disabled = 1;
+
+ /* If a psock is reserved we'll do cleanup in unreserve */
+ if (!kcm->rx_psock) {
+ if (kcm->rx_wait) {
+ list_del(&kcm->wait_rx_list);
+ kcm->rx_wait = false;
+ }
+
+ requeue_rx_msgs(mux, &kcm->sk.sk_receive_queue);
+ }
+
+ spin_unlock_bh(&mux->rx_lock);
+}
+
+/* kcm sock lock held */
+static void kcm_recv_enable(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+
+ if (!kcm->rx_disabled)
+ return;
+
+ spin_lock_bh(&mux->rx_lock);
+
+ kcm->rx_disabled = 0;
+ kcm_rcv_ready(kcm);
+
+ spin_unlock_bh(&mux->rx_lock);
+}
+
+static int kcm_setsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, unsigned int optlen)
+{
+ struct kcm_sock *kcm = kcm_sk(sock->sk);
+ int val, valbool;
+ int err = 0;
+
+ if (level != SOL_KCM)
+ return -ENOPROTOOPT;
+
+ if (optlen < sizeof(int))
+ return -EINVAL;
+
+ if (get_user(val, (int __user *)optval))
+ return -EINVAL;
+
+ valbool = val ? 1 : 0;
+
+ switch (optname) {
+ case KCM_RECV_DISABLE:
+ lock_sock(&kcm->sk);
+ if (valbool)
+ kcm_recv_disable(kcm);
+ else
+ kcm_recv_enable(kcm);
+ release_sock(&kcm->sk);
+ break;
+ default:
+ err = -ENOPROTOOPT;
+ }
+
+ return err;
+}
+
+static int kcm_getsockopt(struct socket *sock, int level, int optname,
+ char __user *optval, int __user *optlen)
+{
+ struct kcm_sock *kcm = kcm_sk(sock->sk);
+ int val, len;
+
+ if (level != SOL_KCM)
+ return -ENOPROTOOPT;
+
+ if (get_user(len, optlen))
+ return -EFAULT;
+
+ len = min_t(unsigned int, len, sizeof(int));
+ if (len < 0)
+ return -EINVAL;
+
+ switch (optname) {
+ case KCM_RECV_DISABLE:
+ val = kcm->rx_disabled;
+ break;
+ default:
+ return -ENOPROTOOPT;
+ }
+
+ if (put_user(len, optlen))
+ return -EFAULT;
+ if (copy_to_user(optval, &val, len))
+ return -EFAULT;
+ return 0;
+}
+
+static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
+{
+ struct kcm_sock *tkcm;
+ struct list_head *head;
+ int index = 0;
+
+ /* For SOCK_SEQPACKET sock type, datagram_poll checks the sk_state, so
+ * we set sk_state, otherwise epoll_wait always returns right away with
+ * POLLHUP
+ */
+ kcm->sk.sk_state = TCP_ESTABLISHED;
+
+ /* Add to mux's kcm sockets list */
+ kcm->mux = mux;
+ spin_lock_bh(&mux->lock);
+
+ head = &mux->kcm_socks;
+ list_for_each_entry(tkcm, &mux->kcm_socks, kcm_sock_list) {
+ if (tkcm->index != index)
+ break;
+ head = &tkcm->kcm_sock_list;
+ index++;
+ }
+
+ list_add(&kcm->kcm_sock_list, head);
+ kcm->index = index;
+
+ mux->kcm_socks_cnt++;
+ spin_unlock_bh(&mux->lock);
+
+ INIT_WORK(&kcm->tx_work, kcm_tx_work);
+
+ spin_lock_bh(&mux->rx_lock);
+ kcm_rcv_ready(kcm);
+ spin_unlock_bh(&mux->rx_lock);
+}
+
+static void kcm_rx_msg_timeout(unsigned long arg)
+{
+ struct kcm_psock *psock = (struct kcm_psock *)arg;
+
+ /* Message assembly timed out */
+ KCM_STATS_INCR(psock->stats.rx_msg_timeouts);
+ kcm_abort_rx_psock(psock, ETIMEDOUT, NULL);
+}
+
+static int kcm_attach(struct socket *sock, struct socket *csock,
+ struct bpf_prog *prog)
+{
+ struct kcm_sock *kcm = kcm_sk(sock->sk);
+ struct kcm_mux *mux = kcm->mux;
+ struct sock *csk;
+ struct kcm_psock *psock = NULL, *tpsock;
+ struct list_head *head;
+ int index = 0;
+
+ if (csock->ops->family != PF_INET &&
+ csock->ops->family != PF_INET6)
+ return -EINVAL;
+
+ csk = csock->sk;
+ if (!csk)
+ return -EINVAL;
+
+ /* Only support TCP for now */
+ if (csk->sk_protocol != IPPROTO_TCP)
+ return -EINVAL;
+
+ psock = kmem_cache_zalloc(kcm_psockp, GFP_KERNEL);
+ if (!psock)
+ return -ENOMEM;
+
+ psock->mux = mux;
+ psock->sk = csk;
+ psock->bpf_prog = prog;
+
+ setup_timer(&psock->rx_msg_timer, kcm_rx_msg_timeout,
+ (unsigned long)psock);
+
+ INIT_WORK(&psock->rx_work, psock_rx_work);
+ INIT_DELAYED_WORK(&psock->rx_delayed_work, psock_rx_delayed_work);
+
+ sock_hold(csk);
+
+ write_lock_bh(&csk->sk_callback_lock);
+ psock->save_data_ready = csk->sk_data_ready;
+ psock->save_write_space = csk->sk_write_space;
+ psock->save_state_change = csk->sk_state_change;
+ csk->sk_user_data = psock;
+ csk->sk_data_ready = psock_tcp_data_ready;
+ csk->sk_write_space = psock_tcp_write_space;
+ csk->sk_state_change = psock_tcp_state_change;
+ write_unlock_bh(&csk->sk_callback_lock);
+
+ /* Finished initialization, now add the psock to the MUX. */
+ spin_lock_bh(&mux->lock);
+ head = &mux->psocks;
+ list_for_each_entry(tpsock, &mux->psocks, psock_list) {
+ if (tpsock->index != index)
+ break;
+ head = &tpsock->psock_list;
+ index++;
+ }
+
+ list_add(&psock->psock_list, head);
+ psock->index = index;
+
+ KCM_STATS_INCR(mux->stats.psock_attach);
+ mux->psocks_cnt++;
+ psock_now_avail(psock);
+ spin_unlock_bh(&mux->lock);
+
+ /* Schedule RX work in case there are already bytes queued */
+ queue_work(kcm_wq, &psock->rx_work);
+
+ return 0;
+}
+
+static int kcm_attach_ioctl(struct socket *sock, struct kcm_attach *info)
+{
+ struct socket *csock;
+ struct bpf_prog *prog;
+ int err;
+
+ csock = sockfd_lookup(info->fd, &err);
+ if (!csock)
+ return -ENOENT;
+
+ prog = bpf_prog_get(info->bpf_fd);
+ if (IS_ERR(prog)) {
+ err = PTR_ERR(prog);
+ goto out;
+ }
+
+ if (prog->type != BPF_PROG_TYPE_SOCKET_FILTER) {
+ bpf_prog_put(prog);
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = kcm_attach(sock, csock, prog);
+ if (err) {
+ bpf_prog_put(prog);
+ goto out;
+ }
+
+ /* Keep reference on file also */
+
+ return 0;
+out:
+ fput(csock->file);
+ return err;
+}
+
+static void kcm_unattach(struct kcm_psock *psock)
+{
+ struct sock *csk = psock->sk;
+ struct kcm_mux *mux = psock->mux;
+
+ /* Stop getting callbacks from TCP socket. After this there should
+ * be no way to reserve a kcm for this psock.
+ */
+ write_lock_bh(&csk->sk_callback_lock);
+ csk->sk_user_data = NULL;
+ csk->sk_data_ready = psock->save_data_ready;
+ csk->sk_write_space = psock->save_write_space;
+ csk->sk_state_change = psock->save_state_change;
+ psock->rx_stopped = 1;
+
+ if (WARN_ON(psock->rx_kcm)) {
+ write_unlock_bh(&csk->sk_callback_lock);
+ return;
+ }
+
+ spin_lock_bh(&mux->rx_lock);
+
+ /* Stop receiver activities. After this point psock should not be
+ * able to get onto ready list either through callbacks or work.
+ */
+ if (psock->ready_rx_msg) {
+ list_del(&psock->psock_ready_list);
+ kfree_skb(psock->ready_rx_msg);
+ psock->ready_rx_msg = NULL;
+ KCM_STATS_INCR(mux->stats.rx_ready_drops);
+ }
+
+ spin_unlock_bh(&mux->rx_lock);
+
+ write_unlock_bh(&csk->sk_callback_lock);
+
+ del_timer_sync(&psock->rx_msg_timer);
+ cancel_work_sync(&psock->rx_work);
+ cancel_delayed_work_sync(&psock->rx_delayed_work);
+
+ bpf_prog_put(psock->bpf_prog);
+
+ kfree_skb(psock->rx_skb_head);
+ psock->rx_skb_head = NULL;
+
+ spin_lock_bh(&mux->lock);
+
+ aggregate_psock_stats(&psock->stats, &mux->aggregate_psock_stats);
+
+ KCM_STATS_INCR(mux->stats.psock_unattach);
+
+ if (psock->tx_kcm) {
+ /* psock was reserved. Just mark it finished and we will clean
+ * up in the kcm paths, we need kcm lock which can not be
+ * acquired here.
+ */
+ KCM_STATS_INCR(mux->stats.psock_unattach_rsvd);
+ spin_unlock_bh(&mux->lock);
+
+ /* We are unattaching a socket that is reserved. Abort the
+ * socket since we may be out of sync in sending on it. We need
+ * to do this without the mux lock.
+ */
+ kcm_abort_tx_psock(psock, EPIPE, false);
+
+ spin_lock_bh(&mux->lock);
+ if (!psock->tx_kcm) {
+ /* psock now unreserved in window mux was unlocked */
+ goto no_reserved;
+ }
+ psock->done = 1;
+
+ /* Commit done before queuing work to process it */
+ smp_mb();
+
+ /* Queue tx work to make sure psock->done is handled */
+ queue_work(kcm_wq, &psock->tx_kcm->tx_work);
+ spin_unlock_bh(&mux->lock);
+ } else {
+no_reserved:
+ if (!psock->tx_stopped)
+ list_del(&psock->psock_avail_list);
+ list_del(&psock->psock_list);
+ mux->psocks_cnt--;
+ spin_unlock_bh(&mux->lock);
+
+ sock_put(csk);
+ fput(csk->sk_socket->file);
+ kmem_cache_free(kcm_psockp, psock);
+ }
+}
+
+static int kcm_unattach_ioctl(struct socket *sock, struct kcm_unattach *info)
+{
+ struct kcm_sock *kcm = kcm_sk(sock->sk);
+ struct kcm_mux *mux = kcm->mux;
+ struct kcm_psock *psock;
+ struct socket *csock;
+ struct sock *csk;
+ int err;
+
+ csock = sockfd_lookup(info->fd, &err);
+ if (!csock)
+ return -ENOENT;
+
+ csk = csock->sk;
+ if (!csk) {
+ err = -EINVAL;
+ goto out;
+ }
+
+ err = -ENOENT;
+
+ spin_lock_bh(&mux->lock);
+
+ list_for_each_entry(psock, &mux->psocks, psock_list) {
+ if (psock->sk != csk)
+ continue;
+
+ /* Found the matching psock */
+
+ if (psock->unattaching || WARN_ON(psock->done)) {
+ err = -EALREADY;
+ break;
+ }
+
+ psock->unattaching = 1;
+
+ spin_unlock_bh(&mux->lock);
+
+ kcm_unattach(psock);
+
+ err = 0;
+ goto out;
+ }
+
+ spin_unlock_bh(&mux->lock);
+
+out:
+ fput(csock->file);
+ return err;
+}
+
+static struct proto kcm_proto = {
+ .name = "KCM",
+ .owner = THIS_MODULE,
+ .obj_size = sizeof(struct kcm_sock),
+};
+
+/* Clone a kcm socket. */
+static int kcm_clone(struct socket *osock, struct kcm_clone *info,
+ struct socket **newsockp)
+{
+ struct socket *newsock;
+ struct sock *newsk;
+ struct file *newfile;
+ int err, newfd;
+
+ err = -ENFILE;
+ newsock = sock_alloc();
+ if (!newsock)
+ goto out;
+
+ newsock->type = osock->type;
+ newsock->ops = osock->ops;
+
+ __module_get(newsock->ops->owner);
+
+ newfd = get_unused_fd_flags(0);
+ if (unlikely(newfd < 0)) {
+ err = newfd;
+ goto out_fd_fail;
+ }
+
+ newfile = sock_alloc_file(newsock, 0, osock->sk->sk_prot_creator->name);
+ if (unlikely(IS_ERR(newfile))) {
+ err = PTR_ERR(newfile);
+ goto out_sock_alloc_fail;
+ }
+
+ newsk = sk_alloc(sock_net(osock->sk), PF_KCM, GFP_KERNEL,
+ &kcm_proto, true);
+ if (!newsk) {
+ err = -ENOMEM;
+ goto out_sk_alloc_fail;
+ }
+
+ sock_init_data(newsock, newsk);
+ init_kcm_sock(kcm_sk(newsk), kcm_sk(osock->sk)->mux);
+
+ fd_install(newfd, newfile);
+ *newsockp = newsock;
+ info->fd = newfd;
+
+ return 0;
+
+out_sk_alloc_fail:
+ fput(newfile);
+out_sock_alloc_fail:
+ put_unused_fd(newfd);
+out_fd_fail:
+ sock_release(newsock);
+out:
+ return err;
+}
+
+static int kcm_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
+{
+ int err;
+
+ switch (cmd) {
+ case SIOCKCMATTACH: {
+ struct kcm_attach info;
+
+ if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+ err = -EFAULT;
+
+ err = kcm_attach_ioctl(sock, &info);
+
+ break;
+ }
+ case SIOCKCMUNATTACH: {
+ struct kcm_unattach info;
+
+ if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+ err = -EFAULT;
+
+ err = kcm_unattach_ioctl(sock, &info);
+
+ break;
+ }
+ case SIOCKCMCLONE: {
+ struct kcm_clone info;
+ struct socket *newsock = NULL;
+
+ if (copy_from_user(&info, (void __user *)arg, sizeof(info)))
+ err = -EFAULT;
+
+ err = kcm_clone(sock, &info, &newsock);
+
+ if (!err) {
+ if (copy_to_user((void __user *)arg, &info,
+ sizeof(info))) {
+ err = -EFAULT;
+ sock_release(newsock);
+ }
+ }
+
+ break;
+ }
+ default:
+ err = -ENOIOCTLCMD;
+ break;
+ }
+
+ return err;
+}
+
+static void free_mux(struct rcu_head *rcu)
+{
+ struct kcm_mux *mux = container_of(rcu,
+ struct kcm_mux, rcu);
+
+ kmem_cache_free(kcm_muxp, mux);
+}
+
+static void release_mux(struct kcm_mux *mux)
+{
+ struct kcm_net *knet = mux->knet;
+ struct kcm_psock *psock, *tmp_psock;
+
+ /* Release psocks */
+ list_for_each_entry_safe(psock, tmp_psock,
+ &mux->psocks, psock_list) {
+ if (!WARN_ON(psock->unattaching))
+ kcm_unattach(psock);
+ }
+
+ if (WARN_ON(mux->psocks_cnt))
+ return;
+
+ __skb_queue_purge(&mux->rx_hold_queue);
+
+ mutex_lock(&knet->mutex);
+ aggregate_mux_stats(&mux->stats, &knet->aggregate_mux_stats);
+ aggregate_psock_stats(&mux->aggregate_psock_stats,
+ &knet->aggregate_psock_stats);
+ list_del_rcu(&mux->kcm_mux_list);
+ knet->count--;
+ mutex_unlock(&knet->mutex);
+
+ call_rcu(&mux->rcu, free_mux);
+}
+
+static void kcm_done(struct kcm_sock *kcm)
+{
+ struct kcm_mux *mux = kcm->mux;
+ struct sock *sk = &kcm->sk;
+ int socks_cnt;
+
+ spin_lock_bh(&mux->rx_lock);
+ if (kcm->rx_psock) {
+ /* Cleanup in unreserve_rx_kcm */
+ WARN_ON(kcm->done);
+ kcm->rx_disabled = 1;
+ kcm->done = 1;
+ spin_unlock_bh(&mux->rx_lock);
+ return;
+ }
+
+ if (kcm->rx_wait) {
+ list_del(&kcm->wait_rx_list);
+ kcm->rx_wait = false;
+ }
+ /* Move any pending receive messages to other kcm sockets */
+ requeue_rx_msgs(mux, &sk->sk_receive_queue);
+
+ spin_unlock_bh(&mux->rx_lock);
+
+ if (WARN_ON(sk_rmem_alloc_get(sk)))
+ return;
+
+ /* Detach from MUX */
+ spin_lock_bh(&mux->lock);
+
+ list_del(&kcm->kcm_sock_list);
+ mux->kcm_socks_cnt--;
+ socks_cnt = mux->kcm_socks_cnt;
+
+ spin_unlock_bh(&mux->lock);
+
+ if (!socks_cnt) {
+ /* We are done with the mux now. */
+ release_mux(mux);
+ }
+
+ WARN_ON(kcm->rx_wait);
+
+ sock_put(&kcm->sk);
+}
+
+/* Called by kcm_release to close a KCM socket.
+ * If this is the last KCM socket on the MUX, destroy the MUX.
+ */
+static int kcm_release(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct kcm_sock *kcm;
+ struct kcm_mux *mux;
+ struct kcm_psock *psock;
+
+ if (!sk)
+ return 0;
+
+ kcm = kcm_sk(sk);
+ mux = kcm->mux;
+
+ sock_orphan(sk);
+ kfree_skb(kcm->seq_skb);
+
+ lock_sock(sk);
+ /* Purge queue under lock to avoid race condition with tx_work trying
+ * to act when queue is nonempty. If tx_work runs after this point
+ * it will just return.
+ */
+ __skb_queue_purge(&sk->sk_write_queue);
+ release_sock(sk);
+
+ spin_lock_bh(&mux->lock);
+ if (kcm->tx_wait) {
+ /* Take of tx_wait list, after this point there should be no way
+ * that a psock will be assigned to this kcm.
+ */
+ list_del(&kcm->wait_psock_list);
+ kcm->tx_wait = false;
+ }
+ spin_unlock_bh(&mux->lock);
+
+ /* Cancel work. After this point there should be no outside references
+ * to the kcm socket.
+ */
+ cancel_work_sync(&kcm->tx_work);
+
+ lock_sock(sk);
+ psock = kcm->tx_psock;
+ if (psock) {
+ /* A psock was reserved, so we need to kill it since it
+ * may already have some bytes queued from a message. We
+ * need to do this after removing kcm from tx_wait list.
+ */
+ kcm_abort_tx_psock(psock, EPIPE, false);
+ unreserve_psock(kcm);
+ }
+ release_sock(sk);
+
+ WARN_ON(kcm->tx_wait);
+ WARN_ON(kcm->tx_psock);
+
+ sock->sk = NULL;
+
+ kcm_done(kcm);
+
+ return 0;
+}
+
+static const struct proto_ops kcm_dgram_ops = {
+ .family = PF_KCM,
+ .owner = THIS_MODULE,
+ .release = kcm_release,
+ .bind = sock_no_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = kcm_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = kcm_setsockopt,
+ .getsockopt = kcm_getsockopt,
+ .sendmsg = kcm_sendmsg,
+ .recvmsg = kcm_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = kcm_sendpage,
+};
+
+static const struct proto_ops kcm_seqpacket_ops = {
+ .family = PF_KCM,
+ .owner = THIS_MODULE,
+ .release = kcm_release,
+ .bind = sock_no_bind,
+ .connect = sock_no_connect,
+ .socketpair = sock_no_socketpair,
+ .accept = sock_no_accept,
+ .getname = sock_no_getname,
+ .poll = datagram_poll,
+ .ioctl = kcm_ioctl,
+ .listen = sock_no_listen,
+ .shutdown = sock_no_shutdown,
+ .setsockopt = kcm_setsockopt,
+ .getsockopt = kcm_getsockopt,
+ .sendmsg = kcm_sendmsg,
+ .recvmsg = kcm_recvmsg,
+ .mmap = sock_no_mmap,
+ .sendpage = kcm_sendpage,
+ .splice_read = kcm_splice_read,
+};
+
+/* Create proto operation for kcm sockets */
+static int kcm_create(struct net *net, struct socket *sock,
+ int protocol, int kern)
+{
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+ struct sock *sk;
+ struct kcm_mux *mux;
+
+ switch (sock->type) {
+ case SOCK_DGRAM:
+ sock->ops = &kcm_dgram_ops;
+ break;
+ case SOCK_SEQPACKET:
+ sock->ops = &kcm_seqpacket_ops;
+ break;
+ default:
+ return -ESOCKTNOSUPPORT;
+ }
+
+ if (protocol != KCMPROTO_CONNECTED)
+ return -EPROTONOSUPPORT;
+
+ sk = sk_alloc(net, PF_KCM, GFP_KERNEL, &kcm_proto, kern);
+ if (!sk)
+ return -ENOMEM;
+
+ /* Allocate a kcm mux, shared between KCM sockets */
+ mux = kmem_cache_zalloc(kcm_muxp, GFP_KERNEL);
+ if (!mux) {
+ sk_free(sk);
+ return -ENOMEM;
+ }
+
+ spin_lock_init(&mux->lock);
+ spin_lock_init(&mux->rx_lock);
+ INIT_LIST_HEAD(&mux->kcm_socks);
+ INIT_LIST_HEAD(&mux->kcm_rx_waiters);
+ INIT_LIST_HEAD(&mux->kcm_tx_waiters);
+
+ INIT_LIST_HEAD(&mux->psocks);
+ INIT_LIST_HEAD(&mux->psocks_ready);
+ INIT_LIST_HEAD(&mux->psocks_avail);
+
+ mux->knet = knet;
+
+ /* Add new MUX to list */
+ mutex_lock(&knet->mutex);
+ list_add_rcu(&mux->kcm_mux_list, &knet->mux_list);
+ knet->count++;
+ mutex_unlock(&knet->mutex);
+
+ skb_queue_head_init(&mux->rx_hold_queue);
+
+ /* Init KCM socket */
+ sock_init_data(sock, sk);
+ init_kcm_sock(kcm_sk(sk), mux);
+
+ return 0;
+}
+
+static struct net_proto_family kcm_family_ops = {
+ .family = PF_KCM,
+ .create = kcm_create,
+ .owner = THIS_MODULE,
+};
+
+static __net_init int kcm_init_net(struct net *net)
+{
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+ INIT_LIST_HEAD_RCU(&knet->mux_list);
+ mutex_init(&knet->mutex);
+
+ return 0;
+}
+
+static __net_exit void kcm_exit_net(struct net *net)
+{
+ struct kcm_net *knet = net_generic(net, kcm_net_id);
+
+ /* All KCM sockets should be closed at this point, which should mean
+ * that all multiplexors and psocks have been destroyed.
+ */
+ WARN_ON(!list_empty(&knet->mux_list));
+}
+
+static struct pernet_operations kcm_net_ops = {
+ .init = kcm_init_net,
+ .exit = kcm_exit_net,
+ .id = &kcm_net_id,
+ .size = sizeof(struct kcm_net),
+};
+
+static int __init kcm_init(void)
+{
+ int err = -ENOMEM;
+
+ kcm_muxp = kmem_cache_create("kcm_mux_cache",
+ sizeof(struct kcm_mux), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ if (!kcm_muxp)
+ goto fail;
+
+ kcm_psockp = kmem_cache_create("kcm_psock_cache",
+ sizeof(struct kcm_psock), 0,
+ SLAB_HWCACHE_ALIGN | SLAB_PANIC, NULL);
+ if (!kcm_psockp)
+ goto fail;
+
+ kcm_wq = create_singlethread_workqueue("kkcmd");
+ if (!kcm_wq)
+ goto fail;
+
+ err = proto_register(&kcm_proto, 1);
+ if (err)
+ goto fail;
+
+ err = sock_register(&kcm_family_ops);
+ if (err)
+ goto sock_register_fail;
+
+ err = register_pernet_device(&kcm_net_ops);
+ if (err)
+ goto net_ops_fail;
+
+ err = kcm_proc_init();
+ if (err)
+ goto proc_init_fail;
+
+ return 0;
+
+proc_init_fail:
+ unregister_pernet_device(&kcm_net_ops);
+
+net_ops_fail:
+ sock_unregister(PF_KCM);
+
+sock_register_fail:
+ proto_unregister(&kcm_proto);
+
+fail:
+ kmem_cache_destroy(kcm_muxp);
+ kmem_cache_destroy(kcm_psockp);
+
+ if (kcm_wq)
+ destroy_workqueue(kcm_wq);
+
+ return err;
+}
+
+static void __exit kcm_exit(void)
+{
+ kcm_proc_exit();
+ unregister_pernet_device(&kcm_net_ops);
+ sock_unregister(PF_KCM);
+ proto_unregister(&kcm_proto);
+ destroy_workqueue(kcm_wq);
+
+ kmem_cache_destroy(kcm_muxp);
+ kmem_cache_destroy(kcm_psockp);
+}
+
+module_init(kcm_init);
+module_exit(kcm_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_NETPROTO(PF_KCM);
+
diff --git a/net/l2tp/l2tp_ip.c b/net/l2tp/l2tp_ip.c
index ec22078b0914..42de4ccd159f 100644
--- a/net/l2tp/l2tp_ip.c
+++ b/net/l2tp/l2tp_ip.c
@@ -123,12 +123,11 @@ static int l2tp_ip_recv(struct sk_buff *skb)
struct l2tp_tunnel *tunnel = NULL;
int length;
- /* Point to L2TP header */
- optr = ptr = skb->data;
-
if (!pskb_may_pull(skb, 4))
goto discard;
+ /* Point to L2TP header */
+ optr = ptr = skb->data;
session_id = ntohl(*((__be32 *) ptr));
ptr += 4;
@@ -156,6 +155,9 @@ static int l2tp_ip_recv(struct sk_buff *skb)
if (!pskb_may_pull(skb, length))
goto discard;
+ /* Point to L2TP header */
+ optr = ptr = skb->data;
+ ptr += 4;
pr_debug("%s: ip recv\n", tunnel->name);
print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
}
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index aca38d8aed8e..cd479903d943 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -25,6 +25,7 @@
#include <net/udp.h>
#include <net/inet_common.h>
#include <net/inet_hashtables.h>
+#include <net/inet6_hashtables.h>
#include <net/tcp_states.h>
#include <net/protocol.h>
#include <net/xfrm.h>
@@ -135,12 +136,11 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
struct l2tp_tunnel *tunnel = NULL;
int length;
- /* Point to L2TP header */
- optr = ptr = skb->data;
-
if (!pskb_may_pull(skb, 4))
goto discard;
+ /* Point to L2TP header */
+ optr = ptr = skb->data;
session_id = ntohl(*((__be32 *) ptr));
ptr += 4;
@@ -168,6 +168,9 @@ static int l2tp_ip6_recv(struct sk_buff *skb)
if (!pskb_may_pull(skb, length))
goto discard;
+ /* Point to L2TP header */
+ optr = ptr = skb->data;
+ ptr += 4;
pr_debug("%s: ip recv\n", tunnel->name);
print_hex_dump_bytes("", DUMP_PREFIX_OFFSET, ptr, length);
}
@@ -486,6 +489,7 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
DECLARE_SOCKADDR(struct sockaddr_l2tpip6 *, lsa, msg->msg_name);
struct in6_addr *daddr, *final_p, final;
struct ipv6_pinfo *np = inet6_sk(sk);
+ struct ipv6_txoptions *opt_to_free = NULL;
struct ipv6_txoptions *opt = NULL;
struct ip6_flowlabel *flowlabel = NULL;
struct dst_entry *dst = NULL;
@@ -575,8 +579,10 @@ static int l2tp_ip6_sendmsg(struct sock *sk, struct msghdr *msg, size_t len)
opt = NULL;
}
- if (opt == NULL)
- opt = np->opt;
+ if (!opt) {
+ opt = txopt_get(np);
+ opt_to_free = opt;
+ }
if (flowlabel)
opt = fl6_merge_options(&opt_space, flowlabel, opt);
opt = ipv6_fixup_options(&opt_space, opt);
@@ -631,6 +637,7 @@ done:
dst_release(dst);
out:
fl6_sock_release(flowlabel);
+ txopt_put(opt_to_free);
return err < 0 ? err : len;
@@ -714,7 +721,7 @@ static struct proto l2tp_ip6_prot = {
.sendmsg = l2tp_ip6_sendmsg,
.recvmsg = l2tp_ip6_recvmsg,
.backlog_rcv = l2tp_ip6_backlog_recv,
- .hash = inet_hash,
+ .hash = inet6_hash,
.unhash = inet_unhash,
.obj_size = sizeof(struct l2tp_ip6_sock),
#ifdef CONFIG_COMPAT
diff --git a/net/l2tp/l2tp_netlink.c b/net/l2tp/l2tp_netlink.c
index f93c5be612a7..2caaa84ce92d 100644
--- a/net/l2tp/l2tp_netlink.c
+++ b/net/l2tp/l2tp_netlink.c
@@ -124,8 +124,13 @@ static int l2tp_tunnel_notify(struct genl_family *family,
ret = l2tp_nl_tunnel_send(msg, info->snd_portid, info->snd_seq,
NLM_F_ACK, tunnel, cmd);
- if (ret >= 0)
- return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ if (ret >= 0) {
+ ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ /* We don't care if no one is listening */
+ if (ret == -ESRCH)
+ ret = 0;
+ return ret;
+ }
nlmsg_free(msg);
@@ -147,8 +152,13 @@ static int l2tp_session_notify(struct genl_family *family,
ret = l2tp_nl_session_send(msg, info->snd_portid, info->snd_seq,
NLM_F_ACK, session, cmd);
- if (ret >= 0)
- return genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ if (ret >= 0) {
+ ret = genlmsg_multicast_allns(family, msg, 0, 0, GFP_ATOMIC);
+ /* We don't care if no one is listening */
+ if (ret == -ESRCH)
+ ret = 0;
+ return ret;
+ }
nlmsg_free(msg);
diff --git a/net/l2tp/l2tp_ppp.c b/net/l2tp/l2tp_ppp.c
index 1ad18c55064c..652c250b9a3b 100644
--- a/net/l2tp/l2tp_ppp.c
+++ b/net/l2tp/l2tp_ppp.c
@@ -230,26 +230,11 @@ static void pppol2tp_recv(struct l2tp_session *session, struct sk_buff *skb, int
if (sk->sk_state & PPPOX_BOUND) {
struct pppox_sock *po;
+
l2tp_dbg(session, PPPOL2TP_MSG_DATA,
"%s: recv %d byte data frame, passing to ppp\n",
session->name, data_len);
- /* We need to forget all info related to the L2TP packet
- * gathered in the skb as we are going to reuse the same
- * skb for the inner packet.
- * Namely we need to:
- * - reset xfrm (IPSec) information as it applies to
- * the outer L2TP packet and not to the inner one
- * - release the dst to force a route lookup on the inner
- * IP packet since skb->dst currently points to the dst
- * of the UDP tunnel
- * - reset netfilter information as it doesn't apply
- * to the inner packet either
- */
- secpath_reset(skb);
- skb_dst_drop(skb);
- nf_reset(skb);
-
po = pppox_sk(sk);
ppp_input(&po->chan, skb);
} else {
@@ -1862,5 +1847,5 @@ MODULE_AUTHOR("James Chapman <jchapman@katalix.com>");
MODULE_DESCRIPTION("PPP over L2TP over UDP");
MODULE_LICENSE("GPL");
MODULE_VERSION(PPPOL2TP_DRV_VERSION);
-MODULE_ALIAS("pppox-proto-" __stringify(PX_PROTO_OL2TP));
+MODULE_ALIAS_NET_PF_PROTO(PF_PPPOX, PX_PROTO_OL2TP);
MODULE_ALIAS_L2TP_PWTYPE(11);
diff --git a/net/l3mdev/l3mdev.c b/net/l3mdev/l3mdev.c
index 8e5ead366e7f..e925037fa0df 100644
--- a/net/l3mdev/l3mdev.c
+++ b/net/l3mdev/l3mdev.c
@@ -17,7 +17,7 @@
* @dev: targeted interface
*/
-int l3mdev_master_ifindex_rcu(struct net_device *dev)
+int l3mdev_master_ifindex_rcu(const struct net_device *dev)
{
int ifindex = 0;
@@ -28,8 +28,15 @@ int l3mdev_master_ifindex_rcu(struct net_device *dev)
ifindex = dev->ifindex;
} else if (netif_is_l3_slave(dev)) {
struct net_device *master;
+ struct net_device *_dev = (struct net_device *)dev;
- master = netdev_master_upper_dev_get_rcu(dev);
+ /* netdev_master_upper_dev_get_rcu calls
+ * list_first_or_null_rcu to walk the upper dev list.
+ * list_first_or_null_rcu does not handle a const arg. We aren't
+ * making changes, just want the master device from that list so
+ * typecast to remove the const
+ */
+ master = netdev_master_upper_dev_get_rcu(_dev);
if (master)
ifindex = master->ifindex;
}
diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
index 8dab4e569571..b3c52e3f689a 100644
--- a/net/llc/af_llc.c
+++ b/net/llc/af_llc.c
@@ -38,7 +38,7 @@ static u16 llc_ui_sap_link_no_max[256];
static struct sockaddr_llc llc_ui_addrnull;
static const struct proto_ops llc_ui_ops;
-static int llc_ui_wait_for_conn(struct sock *sk, long timeout);
+static long llc_ui_wait_for_conn(struct sock *sk, long timeout);
static int llc_ui_wait_for_disc(struct sock *sk, long timeout);
static int llc_ui_wait_for_busy_core(struct sock *sk, long timeout);
@@ -551,7 +551,7 @@ static int llc_ui_wait_for_disc(struct sock *sk, long timeout)
return rc;
}
-static int llc_ui_wait_for_conn(struct sock *sk, long timeout)
+static long llc_ui_wait_for_conn(struct sock *sk, long timeout)
{
DEFINE_WAIT(wait);
diff --git a/net/mac80211/agg-rx.c b/net/mac80211/agg-rx.c
index 10ad4ac1fa0b..3a8f881b22f1 100644
--- a/net/mac80211/agg-rx.c
+++ b/net/mac80211/agg-rx.c
@@ -7,6 +7,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2007-2010, Intel Corporation
+ * Copyright(c) 2015 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -61,16 +62,25 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
{
struct ieee80211_local *local = sta->local;
struct tid_ampdu_rx *tid_rx;
+ struct ieee80211_ampdu_params params = {
+ .sta = &sta->sta,
+ .action = IEEE80211_AMPDU_RX_STOP,
+ .tid = tid,
+ .amsdu = false,
+ .timeout = 0,
+ .ssn = 0,
+ };
lockdep_assert_held(&sta->ampdu_mlme.mtx);
tid_rx = rcu_dereference_protected(sta->ampdu_mlme.tid_rx[tid],
lockdep_is_held(&sta->ampdu_mlme.mtx));
- if (!tid_rx)
+ if (!test_bit(tid, sta->ampdu_mlme.agg_session_valid))
return;
RCU_INIT_POINTER(sta->ampdu_mlme.tid_rx[tid], NULL);
+ __clear_bit(tid, sta->ampdu_mlme.agg_session_valid);
ht_dbg(sta->sdata,
"Rx BA session stop requested for %pM tid %u %s reason: %d\n",
@@ -78,8 +88,7 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
initiator == WLAN_BACK_RECIPIENT ? "recipient" : "inititator",
(int)reason);
- if (drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_STOP,
- &sta->sta, tid, NULL, 0, false))
+ if (drv_ampdu_action(local, sta->sdata, &params))
sdata_info(sta->sdata,
"HW problem - can not stop rx aggregation for %pM tid %d\n",
sta->sta.addr, tid);
@@ -89,6 +98,13 @@ void ___ieee80211_stop_rx_ba_session(struct sta_info *sta, u16 tid,
ieee80211_send_delba(sta->sdata, sta->sta.addr,
tid, WLAN_BACK_RECIPIENT, reason);
+ /*
+ * return here in case tid_rx is not assigned - which will happen if
+ * IEEE80211_HW_SUPPORTS_REORDERING_BUFFER is set.
+ */
+ if (!tid_rx)
+ return;
+
del_timer_sync(&tid_rx->session_timer);
/* make sure ieee80211_sta_reorder_release() doesn't re-arm the timer */
@@ -237,6 +253,15 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
{
struct ieee80211_local *local = sta->sdata->local;
struct tid_ampdu_rx *tid_agg_rx;
+ struct ieee80211_ampdu_params params = {
+ .sta = &sta->sta,
+ .action = IEEE80211_AMPDU_RX_START,
+ .tid = tid,
+ .amsdu = false,
+ .timeout = timeout,
+ .ssn = start_seq_num,
+ };
+
int i, ret = -EOPNOTSUPP;
u16 status = WLAN_STATUS_REQUEST_DECLINED;
@@ -275,11 +300,12 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
/* make sure the size doesn't exceed the maximum supported by the hw */
if (buf_size > local->hw.max_rx_aggregation_subframes)
buf_size = local->hw.max_rx_aggregation_subframes;
+ params.buf_size = buf_size;
/* examine state machine */
mutex_lock(&sta->ampdu_mlme.mtx);
- if (sta->ampdu_mlme.tid_rx[tid]) {
+ if (test_bit(tid, sta->ampdu_mlme.agg_session_valid)) {
ht_dbg_ratelimited(sta->sdata,
"unexpected AddBA Req from %pM on tid %u\n",
sta->sta.addr, tid);
@@ -290,8 +316,18 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
false);
}
+ if (ieee80211_hw_check(&local->hw, SUPPORTS_REORDERING_BUFFER)) {
+ ret = drv_ampdu_action(local, sta->sdata, &params);
+ ht_dbg(sta->sdata,
+ "Rx A-MPDU request on %pM tid %d result %d\n",
+ sta->sta.addr, tid, ret);
+ if (!ret)
+ status = WLAN_STATUS_SUCCESS;
+ goto end;
+ }
+
/* prepare A-MPDU MLME for Rx aggregation */
- tid_agg_rx = kmalloc(sizeof(struct tid_ampdu_rx), GFP_KERNEL);
+ tid_agg_rx = kzalloc(sizeof(*tid_agg_rx), GFP_KERNEL);
if (!tid_agg_rx)
goto end;
@@ -322,8 +358,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
for (i = 0; i < buf_size; i++)
__skb_queue_head_init(&tid_agg_rx->reorder_buf[i]);
- ret = drv_ampdu_action(local, sta->sdata, IEEE80211_AMPDU_RX_START,
- &sta->sta, tid, &start_seq_num, 0, false);
+ ret = drv_ampdu_action(local, sta->sdata, &params);
ht_dbg(sta->sdata, "Rx A-MPDU request on %pM tid %d result %d\n",
sta->sta.addr, tid, ret);
if (ret) {
@@ -341,6 +376,7 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
tid_agg_rx->timeout = timeout;
tid_agg_rx->stored_mpdu_num = 0;
tid_agg_rx->auto_seq = auto_seq;
+ tid_agg_rx->reorder_buf_filtered = 0;
status = WLAN_STATUS_SUCCESS;
/* activate it for RX */
@@ -352,6 +388,8 @@ void __ieee80211_start_rx_ba_session(struct sta_info *sta,
}
end:
+ if (status == WLAN_STATUS_SUCCESS)
+ __set_bit(tid, sta->ampdu_mlme.agg_session_valid);
mutex_unlock(&sta->ampdu_mlme.mtx);
end_no_lock:
diff --git a/net/mac80211/agg-tx.c b/net/mac80211/agg-tx.c
index a758eb84e8f0..4932e9f243a2 100644
--- a/net/mac80211/agg-tx.c
+++ b/net/mac80211/agg-tx.c
@@ -7,6 +7,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2007-2010, Intel Corporation
+ * Copyright(c) 2015 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -295,7 +296,14 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
{
struct ieee80211_local *local = sta->local;
struct tid_ampdu_tx *tid_tx;
- enum ieee80211_ampdu_mlme_action action;
+ struct ieee80211_ampdu_params params = {
+ .sta = &sta->sta,
+ .tid = tid,
+ .buf_size = 0,
+ .amsdu = false,
+ .timeout = 0,
+ .ssn = 0,
+ };
int ret;
lockdep_assert_held(&sta->ampdu_mlme.mtx);
@@ -304,10 +312,10 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
case AGG_STOP_DECLINED:
case AGG_STOP_LOCAL_REQUEST:
case AGG_STOP_PEER_REQUEST:
- action = IEEE80211_AMPDU_TX_STOP_CONT;
+ params.action = IEEE80211_AMPDU_TX_STOP_CONT;
break;
case AGG_STOP_DESTROY_STA:
- action = IEEE80211_AMPDU_TX_STOP_FLUSH;
+ params.action = IEEE80211_AMPDU_TX_STOP_FLUSH;
break;
default:
WARN_ON_ONCE(1);
@@ -330,9 +338,8 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
spin_unlock_bh(&sta->lock);
if (reason != AGG_STOP_DESTROY_STA)
return -EALREADY;
- ret = drv_ampdu_action(local, sta->sdata,
- IEEE80211_AMPDU_TX_STOP_FLUSH_CONT,
- &sta->sta, tid, NULL, 0, false);
+ params.action = IEEE80211_AMPDU_TX_STOP_FLUSH_CONT;
+ ret = drv_ampdu_action(local, sta->sdata, &params);
WARN_ON_ONCE(ret);
return 0;
}
@@ -381,8 +388,7 @@ int ___ieee80211_stop_tx_ba_session(struct sta_info *sta, u16 tid,
WLAN_BACK_INITIATOR;
tid_tx->tx_stop = reason == AGG_STOP_LOCAL_REQUEST;
- ret = drv_ampdu_action(local, sta->sdata, action,
- &sta->sta, tid, NULL, 0, false);
+ ret = drv_ampdu_action(local, sta->sdata, &params);
/* HW shall not deny going back to legacy */
if (WARN_ON(ret)) {
@@ -445,7 +451,14 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
struct tid_ampdu_tx *tid_tx;
struct ieee80211_local *local = sta->local;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- u16 start_seq_num;
+ struct ieee80211_ampdu_params params = {
+ .sta = &sta->sta,
+ .action = IEEE80211_AMPDU_TX_START,
+ .tid = tid,
+ .buf_size = 0,
+ .amsdu = false,
+ .timeout = 0,
+ };
int ret;
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
@@ -467,10 +480,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
*/
synchronize_net();
- start_seq_num = sta->tid_seq[tid] >> 4;
-
- ret = drv_ampdu_action(local, sdata, IEEE80211_AMPDU_TX_START,
- &sta->sta, tid, &start_seq_num, 0, false);
+ params.ssn = sta->tid_seq[tid] >> 4;
+ ret = drv_ampdu_action(local, sdata, &params);
if (ret) {
ht_dbg(sdata,
"BA request denied - HW unavailable for %pM tid %d\n",
@@ -499,8 +510,8 @@ void ieee80211_tx_ba_session_handle_start(struct sta_info *sta, int tid)
/* send AddBA request */
ieee80211_send_addba_request(sdata, sta->sta.addr, tid,
- tid_tx->dialog_token, start_seq_num,
- local->hw.max_tx_aggregation_subframes,
+ tid_tx->dialog_token, params.ssn,
+ IEEE80211_MAX_AMPDU_BUF,
tid_tx->timeout);
}
@@ -684,18 +695,24 @@ static void ieee80211_agg_tx_operational(struct ieee80211_local *local,
struct sta_info *sta, u16 tid)
{
struct tid_ampdu_tx *tid_tx;
+ struct ieee80211_ampdu_params params = {
+ .sta = &sta->sta,
+ .action = IEEE80211_AMPDU_TX_OPERATIONAL,
+ .tid = tid,
+ .timeout = 0,
+ .ssn = 0,
+ };
lockdep_assert_held(&sta->ampdu_mlme.mtx);
tid_tx = rcu_dereference_protected_tid_tx(sta, tid);
+ params.buf_size = tid_tx->buf_size;
+ params.amsdu = tid_tx->amsdu;
ht_dbg(sta->sdata, "Aggregation is on for %pM tid %d\n",
sta->sta.addr, tid);
- drv_ampdu_action(local, sta->sdata,
- IEEE80211_AMPDU_TX_OPERATIONAL,
- &sta->sta, tid, NULL, tid_tx->buf_size,
- tid_tx->amsdu);
+ drv_ampdu_action(local, sta->sdata, &params);
/*
* synchronize with TX path, while splicing the TX path
@@ -926,6 +943,7 @@ void ieee80211_process_addba_resp(struct ieee80211_local *local,
amsdu = capab & IEEE80211_ADDBA_PARAM_AMSDU_MASK;
tid = (capab & IEEE80211_ADDBA_PARAM_TID_MASK) >> 2;
buf_size = (capab & IEEE80211_ADDBA_PARAM_BUF_SIZE_MASK) >> 6;
+ buf_size = min(buf_size, local->hw.max_tx_aggregation_subframes);
mutex_lock(&sta->ampdu_mlme.mtx);
diff --git a/net/mac80211/cfg.c b/net/mac80211/cfg.c
index c2bd1b6a6922..fe1704c4e8fb 100644
--- a/net/mac80211/cfg.c
+++ b/net/mac80211/cfg.c
@@ -339,8 +339,9 @@ static int ieee80211_get_key(struct wiphy *wiphy, struct net_device *dev,
switch (key->conf.cipher) {
case WLAN_CIPHER_SUITE_TKIP:
- iv32 = key->u.tkip.tx.iv32;
- iv16 = key->u.tkip.tx.iv16;
+ pn64 = atomic64_read(&key->conf.tx_pn);
+ iv32 = TKIP_PN_TO_IV32(pn64);
+ iv16 = TKIP_PN_TO_IV16(pn64);
if (key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
!(key->conf.flags & IEEE80211_KEY_FLAG_GENERATE_IV)) {
@@ -1131,6 +1132,34 @@ static int sta_apply_parameters(struct ieee80211_local *local,
sta->sta.max_sp = params->max_sp;
}
+ /* The sender might not have sent the last bit, consider it to be 0 */
+ if (params->ext_capab_len >= 8) {
+ u8 val = (params->ext_capab[7] &
+ WLAN_EXT_CAPA8_MAX_MSDU_IN_AMSDU_LSB) >> 7;
+
+ /* we did get all the bits, take the MSB as well */
+ if (params->ext_capab_len >= 9) {
+ u8 val_msb = params->ext_capab[8] &
+ WLAN_EXT_CAPA9_MAX_MSDU_IN_AMSDU_MSB;
+ val_msb <<= 1;
+ val |= val_msb;
+ }
+
+ switch (val) {
+ case 1:
+ sta->sta.max_amsdu_subframes = 32;
+ break;
+ case 2:
+ sta->sta.max_amsdu_subframes = 16;
+ break;
+ case 3:
+ sta->sta.max_amsdu_subframes = 8;
+ break;
+ default:
+ sta->sta.max_amsdu_subframes = 0;
+ }
+ }
+
/*
* cfg80211 validates this (1-2007) and allows setting the AID
* only when creating a new station entry
@@ -1160,6 +1189,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
params->ht_capa, sta);
+ /* VHT can override some HT caps such as the A-MSDU max length */
if (params->vht_capa)
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
params->vht_capa, sta);
@@ -1169,8 +1199,7 @@ static int sta_apply_parameters(struct ieee80211_local *local,
* rc isn't initialized here yet, so ignore it
*/
__ieee80211_vht_handle_opmode(sdata, sta,
- params->opmode_notif,
- band, false);
+ params->opmode_notif, band);
}
if (ieee80211_vif_is_mesh(&sdata->vif))
@@ -1216,16 +1245,6 @@ static int ieee80211_add_station(struct wiphy *wiphy, struct net_device *dev,
if (!sta)
return -ENOMEM;
- /*
- * defaults -- if userspace wants something else we'll
- * change it accordingly in sta_apply_parameters()
- */
- if (!(params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER)) &&
- !(params->sta_flags_set & (BIT(NL80211_STA_FLAG_AUTHENTICATED) |
- BIT(NL80211_STA_FLAG_ASSOCIATED)))) {
- sta_info_pre_move_state(sta, IEEE80211_STA_AUTH);
- sta_info_pre_move_state(sta, IEEE80211_STA_ASSOC);
- }
if (params->sta_flags_set & BIT(NL80211_STA_FLAG_TDLS_PEER))
sta->sta.tdls = true;
@@ -1994,6 +2013,11 @@ static int ieee80211_scan(struct wiphy *wiphy,
return ieee80211_request_scan(sdata, req);
}
+static void ieee80211_abort_scan(struct wiphy *wiphy, struct wireless_dev *wdev)
+{
+ ieee80211_scan_cancel(wiphy_priv(wiphy));
+}
+
static int
ieee80211_sched_scan_start(struct wiphy *wiphy,
struct net_device *dev,
@@ -2509,294 +2533,6 @@ static int ieee80211_set_bitrate_mask(struct wiphy *wiphy,
return 0;
}
-static bool ieee80211_coalesce_started_roc(struct ieee80211_local *local,
- struct ieee80211_roc_work *new_roc,
- struct ieee80211_roc_work *cur_roc)
-{
- unsigned long now = jiffies;
- unsigned long remaining = cur_roc->hw_start_time +
- msecs_to_jiffies(cur_roc->duration) -
- now;
-
- if (WARN_ON(!cur_roc->started || !cur_roc->hw_begun))
- return false;
-
- /* if it doesn't fit entirely, schedule a new one */
- if (new_roc->duration > jiffies_to_msecs(remaining))
- return false;
-
- ieee80211_handle_roc_started(new_roc);
-
- /* add to dependents so we send the expired event properly */
- list_add_tail(&new_roc->list, &cur_roc->dependents);
- return true;
-}
-
-static u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local)
-{
- lockdep_assert_held(&local->mtx);
-
- local->roc_cookie_counter++;
-
- /* wow, you wrapped 64 bits ... more likely a bug */
- if (WARN_ON(local->roc_cookie_counter == 0))
- local->roc_cookie_counter++;
-
- return local->roc_cookie_counter;
-}
-
-static int ieee80211_start_roc_work(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata,
- struct ieee80211_channel *channel,
- unsigned int duration, u64 *cookie,
- struct sk_buff *txskb,
- enum ieee80211_roc_type type)
-{
- struct ieee80211_roc_work *roc, *tmp;
- bool queued = false;
- int ret;
-
- lockdep_assert_held(&local->mtx);
-
- if (local->use_chanctx && !local->ops->remain_on_channel)
- return -EOPNOTSUPP;
-
- roc = kzalloc(sizeof(*roc), GFP_KERNEL);
- if (!roc)
- return -ENOMEM;
-
- /*
- * If the duration is zero, then the driver
- * wouldn't actually do anything. Set it to
- * 10 for now.
- *
- * TODO: cancel the off-channel operation
- * when we get the SKB's TX status and
- * the wait time was zero before.
- */
- if (!duration)
- duration = 10;
-
- roc->chan = channel;
- roc->duration = duration;
- roc->req_duration = duration;
- roc->frame = txskb;
- roc->type = type;
- roc->sdata = sdata;
- INIT_DELAYED_WORK(&roc->work, ieee80211_sw_roc_work);
- INIT_LIST_HEAD(&roc->dependents);
-
- /*
- * cookie is either the roc cookie (for normal roc)
- * or the SKB (for mgmt TX)
- */
- if (!txskb) {
- roc->cookie = ieee80211_mgmt_tx_cookie(local);
- *cookie = roc->cookie;
- } else {
- roc->mgmt_tx_cookie = *cookie;
- }
-
- /* if there's one pending or we're scanning, queue this one */
- if (!list_empty(&local->roc_list) ||
- local->scanning || ieee80211_is_radar_required(local))
- goto out_check_combine;
-
- /* if not HW assist, just queue & schedule work */
- if (!local->ops->remain_on_channel) {
- ieee80211_queue_delayed_work(&local->hw, &roc->work, 0);
- goto out_queue;
- }
-
- /* otherwise actually kick it off here (for error handling) */
-
- ret = drv_remain_on_channel(local, sdata, channel, duration, type);
- if (ret) {
- kfree(roc);
- return ret;
- }
-
- roc->started = true;
- goto out_queue;
-
- out_check_combine:
- list_for_each_entry(tmp, &local->roc_list, list) {
- if (tmp->chan != channel || tmp->sdata != sdata)
- continue;
-
- /*
- * Extend this ROC if possible:
- *
- * If it hasn't started yet, just increase the duration
- * and add the new one to the list of dependents.
- * If the type of the new ROC has higher priority, modify the
- * type of the previous one to match that of the new one.
- */
- if (!tmp->started) {
- list_add_tail(&roc->list, &tmp->dependents);
- tmp->duration = max(tmp->duration, roc->duration);
- tmp->type = max(tmp->type, roc->type);
- queued = true;
- break;
- }
-
- /* If it has already started, it's more difficult ... */
- if (local->ops->remain_on_channel) {
- /*
- * In the offloaded ROC case, if it hasn't begun, add
- * this new one to the dependent list to be handled
- * when the master one begins. If it has begun,
- * check if it fits entirely within the existing one,
- * in which case it will just be dependent as well.
- * Otherwise, schedule it by itself.
- */
- if (!tmp->hw_begun) {
- list_add_tail(&roc->list, &tmp->dependents);
- queued = true;
- break;
- }
-
- if (ieee80211_coalesce_started_roc(local, roc, tmp))
- queued = true;
- } else if (del_timer_sync(&tmp->work.timer)) {
- unsigned long new_end;
-
- /*
- * In the software ROC case, cancel the timer, if
- * that fails then the finish work is already
- * queued/pending and thus we queue the new ROC
- * normally, if that succeeds then we can extend
- * the timer duration and TX the frame (if any.)
- */
-
- list_add_tail(&roc->list, &tmp->dependents);
- queued = true;
-
- new_end = jiffies + msecs_to_jiffies(roc->duration);
-
- /* ok, it was started & we canceled timer */
- if (time_after(new_end, tmp->work.timer.expires))
- mod_timer(&tmp->work.timer, new_end);
- else
- add_timer(&tmp->work.timer);
-
- ieee80211_handle_roc_started(roc);
- }
- break;
- }
-
- out_queue:
- if (!queued)
- list_add_tail(&roc->list, &local->roc_list);
-
- return 0;
-}
-
-static int ieee80211_remain_on_channel(struct wiphy *wiphy,
- struct wireless_dev *wdev,
- struct ieee80211_channel *chan,
- unsigned int duration,
- u64 *cookie)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- struct ieee80211_local *local = sdata->local;
- int ret;
-
- mutex_lock(&local->mtx);
- ret = ieee80211_start_roc_work(local, sdata, chan,
- duration, cookie, NULL,
- IEEE80211_ROC_TYPE_NORMAL);
- mutex_unlock(&local->mtx);
-
- return ret;
-}
-
-static int ieee80211_cancel_roc(struct ieee80211_local *local,
- u64 cookie, bool mgmt_tx)
-{
- struct ieee80211_roc_work *roc, *tmp, *found = NULL;
- int ret;
-
- mutex_lock(&local->mtx);
- list_for_each_entry_safe(roc, tmp, &local->roc_list, list) {
- struct ieee80211_roc_work *dep, *tmp2;
-
- list_for_each_entry_safe(dep, tmp2, &roc->dependents, list) {
- if (!mgmt_tx && dep->cookie != cookie)
- continue;
- else if (mgmt_tx && dep->mgmt_tx_cookie != cookie)
- continue;
- /* found dependent item -- just remove it */
- list_del(&dep->list);
- mutex_unlock(&local->mtx);
-
- ieee80211_roc_notify_destroy(dep, true);
- return 0;
- }
-
- if (!mgmt_tx && roc->cookie != cookie)
- continue;
- else if (mgmt_tx && roc->mgmt_tx_cookie != cookie)
- continue;
-
- found = roc;
- break;
- }
-
- if (!found) {
- mutex_unlock(&local->mtx);
- return -ENOENT;
- }
-
- /*
- * We found the item to cancel, so do that. Note that it
- * may have dependents, which we also cancel (and send
- * the expired signal for.) Not doing so would be quite
- * tricky here, but we may need to fix it later.
- */
-
- if (local->ops->remain_on_channel) {
- if (found->started) {
- ret = drv_cancel_remain_on_channel(local);
- if (WARN_ON_ONCE(ret)) {
- mutex_unlock(&local->mtx);
- return ret;
- }
- }
-
- list_del(&found->list);
-
- if (found->started)
- ieee80211_start_next_roc(local);
- mutex_unlock(&local->mtx);
-
- ieee80211_roc_notify_destroy(found, true);
- } else {
- /* work may be pending so use it all the time */
- found->abort = true;
- ieee80211_queue_delayed_work(&local->hw, &found->work, 0);
-
- mutex_unlock(&local->mtx);
-
- /* work will clean up etc */
- flush_delayed_work(&found->work);
- WARN_ON(!found->to_be_freed);
- kfree(found);
- }
-
- return 0;
-}
-
-static int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
- struct wireless_dev *wdev,
- u64 cookie)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- struct ieee80211_local *local = sdata->local;
-
- return ieee80211_cancel_roc(local, cookie, false);
-}
-
static int ieee80211_start_radar_detection(struct wiphy *wiphy,
struct net_device *dev,
struct cfg80211_chan_def *chandef,
@@ -3267,9 +3003,21 @@ int ieee80211_channel_switch(struct wiphy *wiphy, struct net_device *dev,
return err;
}
-static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local,
- struct sk_buff *skb, u64 *cookie,
- gfp_t gfp)
+u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local)
+{
+ lockdep_assert_held(&local->mtx);
+
+ local->roc_cookie_counter++;
+
+ /* wow, you wrapped 64 bits ... more likely a bug */
+ if (WARN_ON(local->roc_cookie_counter == 0))
+ local->roc_cookie_counter++;
+
+ return local->roc_cookie_counter;
+}
+
+int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
+ u64 *cookie, gfp_t gfp)
{
unsigned long spin_flags;
struct sk_buff *ack_skb;
@@ -3277,7 +3025,7 @@ static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local,
ack_skb = skb_copy(skb, gfp);
if (!ack_skb)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
spin_lock_irqsave(&local->ack_status_lock, spin_flags);
id = idr_alloc(&local->ack_status_frames, ack_skb,
@@ -3286,7 +3034,7 @@ static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local,
if (id < 0) {
kfree_skb(ack_skb);
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
}
IEEE80211_SKB_CB(skb)->ack_frame_id = id;
@@ -3294,200 +3042,7 @@ static struct sk_buff *ieee80211_make_ack_skb(struct ieee80211_local *local,
*cookie = ieee80211_mgmt_tx_cookie(local);
IEEE80211_SKB_CB(ack_skb)->ack.cookie = *cookie;
- return ack_skb;
-}
-
-static int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
- struct cfg80211_mgmt_tx_params *params,
- u64 *cookie)
-{
- struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
- struct ieee80211_local *local = sdata->local;
- struct sk_buff *skb, *ack_skb;
- struct sta_info *sta;
- const struct ieee80211_mgmt *mgmt = (void *)params->buf;
- bool need_offchan = false;
- u32 flags;
- int ret;
- u8 *data;
-
- if (params->dont_wait_for_ack)
- flags = IEEE80211_TX_CTL_NO_ACK;
- else
- flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX |
- IEEE80211_TX_CTL_REQ_TX_STATUS;
-
- if (params->no_cck)
- flags |= IEEE80211_TX_CTL_NO_CCK_RATE;
-
- switch (sdata->vif.type) {
- case NL80211_IFTYPE_ADHOC:
- if (!sdata->vif.bss_conf.ibss_joined)
- need_offchan = true;
- /* fall through */
-#ifdef CONFIG_MAC80211_MESH
- case NL80211_IFTYPE_MESH_POINT:
- if (ieee80211_vif_is_mesh(&sdata->vif) &&
- !sdata->u.mesh.mesh_id_len)
- need_offchan = true;
- /* fall through */
-#endif
- case NL80211_IFTYPE_AP:
- case NL80211_IFTYPE_AP_VLAN:
- case NL80211_IFTYPE_P2P_GO:
- if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
- !ieee80211_vif_is_mesh(&sdata->vif) &&
- !rcu_access_pointer(sdata->bss->beacon))
- need_offchan = true;
- if (!ieee80211_is_action(mgmt->frame_control) ||
- mgmt->u.action.category == WLAN_CATEGORY_PUBLIC ||
- mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED ||
- mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT)
- break;
- rcu_read_lock();
- sta = sta_info_get(sdata, mgmt->da);
- rcu_read_unlock();
- if (!sta)
- return -ENOLINK;
- break;
- case NL80211_IFTYPE_STATION:
- case NL80211_IFTYPE_P2P_CLIENT:
- sdata_lock(sdata);
- if (!sdata->u.mgd.associated ||
- (params->offchan && params->wait &&
- local->ops->remain_on_channel &&
- memcmp(sdata->u.mgd.associated->bssid,
- mgmt->bssid, ETH_ALEN)))
- need_offchan = true;
- sdata_unlock(sdata);
- break;
- case NL80211_IFTYPE_P2P_DEVICE:
- need_offchan = true;
- break;
- default:
- return -EOPNOTSUPP;
- }
-
- /* configurations requiring offchan cannot work if no channel has been
- * specified
- */
- if (need_offchan && !params->chan)
- return -EINVAL;
-
- mutex_lock(&local->mtx);
-
- /* Check if the operating channel is the requested channel */
- if (!need_offchan) {
- struct ieee80211_chanctx_conf *chanctx_conf;
-
- rcu_read_lock();
- chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
-
- if (chanctx_conf) {
- need_offchan = params->chan &&
- (params->chan !=
- chanctx_conf->def.chan);
- } else if (!params->chan) {
- ret = -EINVAL;
- rcu_read_unlock();
- goto out_unlock;
- } else {
- need_offchan = true;
- }
- rcu_read_unlock();
- }
-
- if (need_offchan && !params->offchan) {
- ret = -EBUSY;
- goto out_unlock;
- }
-
- skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len);
- if (!skb) {
- ret = -ENOMEM;
- goto out_unlock;
- }
- skb_reserve(skb, local->hw.extra_tx_headroom);
-
- data = skb_put(skb, params->len);
- memcpy(data, params->buf, params->len);
-
- /* Update CSA counters */
- if (sdata->vif.csa_active &&
- (sdata->vif.type == NL80211_IFTYPE_AP ||
- sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
- sdata->vif.type == NL80211_IFTYPE_ADHOC) &&
- params->n_csa_offsets) {
- int i;
- struct beacon_data *beacon = NULL;
-
- rcu_read_lock();
-
- if (sdata->vif.type == NL80211_IFTYPE_AP)
- beacon = rcu_dereference(sdata->u.ap.beacon);
- else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
- beacon = rcu_dereference(sdata->u.ibss.presp);
- else if (ieee80211_vif_is_mesh(&sdata->vif))
- beacon = rcu_dereference(sdata->u.mesh.beacon);
-
- if (beacon)
- for (i = 0; i < params->n_csa_offsets; i++)
- data[params->csa_offsets[i]] =
- beacon->csa_current_counter;
-
- rcu_read_unlock();
- }
-
- IEEE80211_SKB_CB(skb)->flags = flags;
-
- skb->dev = sdata->dev;
-
- if (!params->dont_wait_for_ack) {
- /* make a copy to preserve the frame contents
- * in case of encryption.
- */
- ack_skb = ieee80211_make_ack_skb(local, skb, cookie,
- GFP_KERNEL);
- if (IS_ERR(ack_skb)) {
- ret = PTR_ERR(ack_skb);
- kfree_skb(skb);
- goto out_unlock;
- }
- } else {
- /* for cookie below */
- ack_skb = skb;
- }
-
- if (!need_offchan) {
- ieee80211_tx_skb(sdata, skb);
- ret = 0;
- goto out_unlock;
- }
-
- IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN |
- IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
- if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
- IEEE80211_SKB_CB(skb)->hw_queue =
- local->hw.offchannel_tx_hw_queue;
-
- /* This will handle all kinds of coalescing and immediate TX */
- ret = ieee80211_start_roc_work(local, sdata, params->chan,
- params->wait, cookie, skb,
- IEEE80211_ROC_TYPE_MGMT_TX);
- if (ret)
- kfree_skb(skb);
- out_unlock:
- mutex_unlock(&local->mtx);
- return ret;
-}
-
-static int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy,
- struct wireless_dev *wdev,
- u64 cookie)
-{
- struct ieee80211_local *local = wiphy_priv(wiphy);
-
- return ieee80211_cancel_roc(local, cookie, true);
+ return 0;
}
static void ieee80211_mgmt_frame_register(struct wiphy *wiphy,
@@ -3565,7 +3120,7 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
struct ieee80211_sub_if_data *sdata = IEEE80211_DEV_TO_SUB_IF(dev);
struct ieee80211_local *local = sdata->local;
struct ieee80211_qos_hdr *nullfunc;
- struct sk_buff *skb, *ack_skb;
+ struct sk_buff *skb;
int size = sizeof(*nullfunc);
__le16 fc;
bool qos;
@@ -3633,10 +3188,9 @@ static int ieee80211_probe_client(struct wiphy *wiphy, struct net_device *dev,
if (qos)
nullfunc->qos_ctrl = cpu_to_le16(7);
- ack_skb = ieee80211_make_ack_skb(local, skb, cookie, GFP_ATOMIC);
- if (IS_ERR(ack_skb)) {
+ ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_ATOMIC);
+ if (ret) {
kfree_skb(skb);
- ret = PTR_ERR(ack_skb);
goto unlock;
}
@@ -3838,6 +3392,7 @@ const struct cfg80211_ops mac80211_config_ops = {
.suspend = ieee80211_suspend,
.resume = ieee80211_resume,
.scan = ieee80211_scan,
+ .abort_scan = ieee80211_abort_scan,
.sched_scan_start = ieee80211_sched_scan_start,
.sched_scan_stop = ieee80211_sched_scan_stop,
.auth = ieee80211_auth,
diff --git a/net/mac80211/chan.c b/net/mac80211/chan.c
index 1d1b9b7bdefe..74142d07ad31 100644
--- a/net/mac80211/chan.c
+++ b/net/mac80211/chan.c
@@ -231,7 +231,7 @@ ieee80211_get_max_required_bw(struct ieee80211_sub_if_data *sdata)
!(sta->sdata->bss && sta->sdata->bss == sdata->bss))
continue;
- if (!sta->uploaded)
+ if (!sta->uploaded || !test_sta_flag(sta, WLAN_STA_ASSOC))
continue;
max_bw = max(max_bw, ieee80211_get_sta_bw(&sta->sta));
@@ -343,8 +343,10 @@ static void ieee80211_change_chanctx(struct ieee80211_local *local,
struct ieee80211_chanctx *ctx,
const struct cfg80211_chan_def *chandef)
{
- if (cfg80211_chandef_identical(&ctx->conf.def, chandef))
+ if (cfg80211_chandef_identical(&ctx->conf.def, chandef)) {
+ ieee80211_recalc_chanctx_min_def(local, ctx);
return;
+ }
WARN_ON(!cfg80211_chandef_compatible(&ctx->conf.def, chandef));
diff --git a/net/mac80211/debugfs.c b/net/mac80211/debugfs.c
index 4d2aaebd4f97..4ab5c522ceee 100644
--- a/net/mac80211/debugfs.c
+++ b/net/mac80211/debugfs.c
@@ -91,7 +91,7 @@ static const struct file_operations reset_ops = {
};
#endif
-static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
+static const char *hw_flag_names[] = {
#define FLAG(F) [IEEE80211_HW_##F] = #F
FLAG(HAS_RATE_CONTROL),
FLAG(RX_INCLUDES_FCS),
@@ -125,9 +125,8 @@ static const char *hw_flag_names[NUM_IEEE80211_HW_FLAGS + 1] = {
FLAG(TDLS_WIDER_BW),
FLAG(SUPPORTS_AMSDU_IN_AMPDU),
FLAG(BEACON_TX_STATUS),
-
- /* keep last for the build bug below */
- (void *)0x1
+ FLAG(NEEDS_UNIQUE_STA_ADDR),
+ FLAG(SUPPORTS_REORDERING_BUFFER),
#undef FLAG
};
@@ -147,7 +146,7 @@ static ssize_t hwflags_read(struct file *file, char __user *user_buf,
/* fail compilation if somebody adds or removes
* a flag without updating the name array above
*/
- BUILD_BUG_ON(hw_flag_names[NUM_IEEE80211_HW_FLAGS] != (void *)0x1);
+ BUILD_BUG_ON(ARRAY_SIZE(hw_flag_names) != NUM_IEEE80211_HW_FLAGS);
for (i = 0; i < NUM_IEEE80211_HW_FLAGS; i++) {
if (test_bit(i, local->hw.flags))
diff --git a/net/mac80211/debugfs_key.c b/net/mac80211/debugfs_key.c
index 7961e7d0b61e..a2ef95f16f11 100644
--- a/net/mac80211/debugfs_key.c
+++ b/net/mac80211/debugfs_key.c
@@ -132,9 +132,10 @@ static ssize_t key_tx_spec_read(struct file *file, char __user *userbuf,
len = scnprintf(buf, sizeof(buf), "\n");
break;
case WLAN_CIPHER_SUITE_TKIP:
+ pn = atomic64_read(&key->conf.tx_pn);
len = scnprintf(buf, sizeof(buf), "%08x %04x\n",
- key->u.tkip.tx.iv32,
- key->u.tkip.tx.iv16);
+ TKIP_PN_TO_IV32(pn),
+ TKIP_PN_TO_IV16(pn));
break;
case WLAN_CIPHER_SUITE_CCMP:
case WLAN_CIPHER_SUITE_CCMP_256:
diff --git a/net/mac80211/driver-ops.c b/net/mac80211/driver-ops.c
index ca1fe5576103..c258f1041d33 100644
--- a/net/mac80211/driver-ops.c
+++ b/net/mac80211/driver-ops.c
@@ -284,9 +284,7 @@ int drv_switch_vif_chanctx(struct ieee80211_local *local,
int drv_ampdu_action(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
- enum ieee80211_ampdu_mlme_action action,
- struct ieee80211_sta *sta, u16 tid,
- u16 *ssn, u8 buf_size, bool amsdu)
+ struct ieee80211_ampdu_params *params)
{
int ret = -EOPNOTSUPP;
@@ -296,12 +294,10 @@ int drv_ampdu_action(struct ieee80211_local *local,
if (!check_sdata_in_driver(sdata))
return -EIO;
- trace_drv_ampdu_action(local, sdata, action, sta, tid,
- ssn, buf_size, amsdu);
+ trace_drv_ampdu_action(local, sdata, params);
if (local->ops->ampdu_action)
- ret = local->ops->ampdu_action(&local->hw, &sdata->vif, action,
- sta, tid, ssn, buf_size, amsdu);
+ ret = local->ops->ampdu_action(&local->hw, &sdata->vif, params);
trace_drv_return_int(local, ret);
diff --git a/net/mac80211/driver-ops.h b/net/mac80211/driver-ops.h
index 154ce4b13406..18b0d65baff0 100644
--- a/net/mac80211/driver-ops.h
+++ b/net/mac80211/driver-ops.h
@@ -585,9 +585,7 @@ static inline int drv_tx_last_beacon(struct ieee80211_local *local)
int drv_ampdu_action(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
- enum ieee80211_ampdu_mlme_action action,
- struct ieee80211_sta *sta, u16 tid,
- u16 *ssn, u8 buf_size, bool amsdu);
+ struct ieee80211_ampdu_params *params);
static inline int drv_get_survey(struct ieee80211_local *local, int idx,
struct survey_info *survey)
diff --git a/net/mac80211/ht.c b/net/mac80211/ht.c
index 7a76ce639d58..f4a528773563 100644
--- a/net/mac80211/ht.c
+++ b/net/mac80211/ht.c
@@ -230,6 +230,11 @@ bool ieee80211_ht_cap_ie_to_sta_ht_cap(struct ieee80211_sub_if_data *sdata,
/* set Rx highest rate */
ht_cap.mcs.rx_highest = ht_cap_ie->mcs.rx_highest;
+ if (ht_cap.cap & IEEE80211_HT_CAP_MAX_AMSDU)
+ sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_7935;
+ else
+ sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_HT_3839;
+
apply:
changed = memcmp(&sta->sta.ht_cap, &ht_cap, sizeof(ht_cap));
diff --git a/net/mac80211/ibss.c b/net/mac80211/ibss.c
index 337bb5d78003..fc3238376b39 100644
--- a/net/mac80211/ibss.c
+++ b/net/mac80211/ibss.c
@@ -7,6 +7,7 @@
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2009, Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright(c) 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -428,6 +429,7 @@ static void ieee80211_sta_join_ibss(struct ieee80211_sub_if_data *sdata,
chandef.width = sdata->u.ibss.chandef.width;
break;
case NL80211_CHAN_WIDTH_80:
+ case NL80211_CHAN_WIDTH_80P80:
case NL80211_CHAN_WIDTH_160:
chandef = sdata->u.ibss.chandef;
chandef.chan = cbss->channel;
@@ -1049,9 +1051,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
struct cfg80211_chan_def chandef;
enum ieee80211_sta_rx_bandwidth bw = sta->sta.bandwidth;
- ieee80211_ht_oper_to_chandef(channel,
- elems->ht_operation,
- &chandef);
+ cfg80211_chandef_create(&chandef, channel, NL80211_CHAN_NO_HT);
+ ieee80211_chandef_ht_oper(elems->ht_operation, &chandef);
memcpy(&htcap_ie, elems->ht_cap_elem, sizeof(htcap_ie));
rates_updated |= ieee80211_ht_cap_ie_to_sta_ht_cap(sdata, sband,
@@ -1065,9 +1066,8 @@ static void ieee80211_update_sta_info(struct ieee80211_sub_if_data *sdata,
struct ieee80211_vht_cap cap_ie;
struct ieee80211_sta_vht_cap cap = sta->sta.vht_cap;
- ieee80211_vht_oper_to_chandef(channel,
- elems->vht_operation,
- &chandef);
+ ieee80211_chandef_vht_oper(elems->vht_operation,
+ &chandef);
memcpy(&cap_ie, elems->vht_cap_elem, sizeof(cap_ie));
ieee80211_vht_cap_ie_to_sta_vht_cap(sdata, sband,
&cap_ie, sta);
@@ -1484,14 +1484,21 @@ static void ieee80211_sta_find_ibss(struct ieee80211_sub_if_data *sdata)
sdata_info(sdata, "Trigger new scan to find an IBSS to join\n");
- num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy,
- &ifibss->chandef,
- channels,
- ARRAY_SIZE(channels));
scan_width = cfg80211_chandef_to_scan_width(&ifibss->chandef);
- ieee80211_request_ibss_scan(sdata, ifibss->ssid,
- ifibss->ssid_len, channels, num,
- scan_width);
+
+ if (ifibss->fixed_channel) {
+ num = ieee80211_ibss_setup_scan_channels(local->hw.wiphy,
+ &ifibss->chandef,
+ channels,
+ ARRAY_SIZE(channels));
+ ieee80211_request_ibss_scan(sdata, ifibss->ssid,
+ ifibss->ssid_len, channels,
+ num, scan_width);
+ } else {
+ ieee80211_request_ibss_scan(sdata, ifibss->ssid,
+ ifibss->ssid_len, NULL,
+ 0, scan_width);
+ }
} else {
int interval = IEEE80211_SCAN_INTERVAL;
@@ -1732,7 +1739,6 @@ void ieee80211_ibss_notify_scan_completed(struct ieee80211_local *local)
if (sdata->vif.type != NL80211_IFTYPE_ADHOC)
continue;
sdata->u.ibss.last_scan_completed = jiffies;
- ieee80211_queue_work(&local->hw, &sdata->work);
}
mutex_unlock(&local->iflist_mtx);
}
diff --git a/net/mac80211/ieee80211_i.h b/net/mac80211/ieee80211_i.h
index d832bd59236b..422003540169 100644
--- a/net/mac80211/ieee80211_i.h
+++ b/net/mac80211/ieee80211_i.h
@@ -92,7 +92,7 @@ struct ieee80211_fragment_entry {
u16 extra_len;
u16 last_frag;
u8 rx_queue;
- bool ccmp; /* Whether fragments were encrypted with CCMP */
+ bool check_sequential_pn; /* needed for CCMP/GCMP */
u8 last_pn[6]; /* PN of the last fragment if CCMP was used */
};
@@ -325,19 +325,15 @@ struct mesh_preq_queue {
struct ieee80211_roc_work {
struct list_head list;
- struct list_head dependents;
-
- struct delayed_work work;
struct ieee80211_sub_if_data *sdata;
struct ieee80211_channel *chan;
bool started, abort, hw_begun, notified;
- bool to_be_freed;
bool on_channel;
- unsigned long hw_start_time;
+ unsigned long start_time;
u32 duration, req_duration;
struct sk_buff *frame;
@@ -720,7 +716,6 @@ struct ieee80211_if_mesh {
* back to wireless media and to the local net stack.
* @IEEE80211_SDATA_DISCONNECT_RESUME: Disconnect after resume.
* @IEEE80211_SDATA_IN_DRIVER: indicates interface was added to driver
- * @IEEE80211_SDATA_MU_MIMO_OWNER: indicates interface owns MU-MIMO capability
*/
enum ieee80211_sub_if_data_flags {
IEEE80211_SDATA_ALLMULTI = BIT(0),
@@ -728,7 +723,6 @@ enum ieee80211_sub_if_data_flags {
IEEE80211_SDATA_DONT_BRIDGE_PACKETS = BIT(3),
IEEE80211_SDATA_DISCONNECT_RESUME = BIT(4),
IEEE80211_SDATA_IN_DRIVER = BIT(5),
- IEEE80211_SDATA_MU_MIMO_OWNER = BIT(6),
};
/**
@@ -808,6 +802,7 @@ enum txq_info_flags {
struct txq_info {
struct sk_buff_head queue;
unsigned long flags;
+ unsigned long byte_cnt;
/* keep last! */
struct ieee80211_txq txq;
@@ -1335,6 +1330,7 @@ struct ieee80211_local {
/*
* Remain-on-channel support
*/
+ struct delayed_work roc_work;
struct list_head roc_list;
struct work_struct hw_roc_start, hw_roc_done;
unsigned long hw_roc_start_time;
@@ -1469,7 +1465,13 @@ ieee80211_have_rx_timestamp(struct ieee80211_rx_status *status)
{
WARN_ON_ONCE(status->flag & RX_FLAG_MACTIME_START &&
status->flag & RX_FLAG_MACTIME_END);
- return status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END);
+ if (status->flag & (RX_FLAG_MACTIME_START | RX_FLAG_MACTIME_END))
+ return true;
+ /* can't handle HT/VHT preamble yet */
+ if (status->flag & RX_FLAG_MACTIME_PLCP_START &&
+ !(status->flag & (RX_FLAG_HT | RX_FLAG_VHT)))
+ return true;
+ return false;
}
u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
@@ -1483,6 +1485,10 @@ void ieee80211_bss_info_change_notify(struct ieee80211_sub_if_data *sdata,
void ieee80211_configure_filter(struct ieee80211_local *local);
u32 ieee80211_reset_erp_info(struct ieee80211_sub_if_data *sdata);
+u64 ieee80211_mgmt_tx_cookie(struct ieee80211_local *local);
+int ieee80211_attach_ack_skb(struct ieee80211_local *local, struct sk_buff *skb,
+ u64 *cookie, gfp_t gfp);
+
/* STA code */
void ieee80211_sta_setup_sdata(struct ieee80211_sub_if_data *sdata);
int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
@@ -1577,16 +1583,22 @@ int ieee80211_request_sched_scan_stop(struct ieee80211_local *local);
void ieee80211_sched_scan_end(struct ieee80211_local *local);
void ieee80211_sched_scan_stopped_work(struct work_struct *work);
-/* off-channel helpers */
+/* off-channel/mgmt-tx */
void ieee80211_offchannel_stop_vifs(struct ieee80211_local *local);
void ieee80211_offchannel_return(struct ieee80211_local *local);
void ieee80211_roc_setup(struct ieee80211_local *local);
void ieee80211_start_next_roc(struct ieee80211_local *local);
void ieee80211_roc_purge(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata);
-void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free);
-void ieee80211_sw_roc_work(struct work_struct *work);
-void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc);
+int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct ieee80211_channel *chan,
+ unsigned int duration, u64 *cookie);
+int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
+ struct wireless_dev *wdev, u64 cookie);
+int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct cfg80211_mgmt_tx_params *params, u64 *cookie);
+int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy,
+ struct wireless_dev *wdev, u64 cookie);
/* channel switch handling */
void ieee80211_csa_finalize_work(struct work_struct *work);
@@ -1707,12 +1719,18 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta);
enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta);
void ieee80211_sta_set_rx_nss(struct sta_info *sta);
+enum ieee80211_sta_rx_bandwidth
+ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width);
+enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta);
+void ieee80211_sta_set_rx_nss(struct sta_info *sta);
+void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt);
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only);
+ enum ieee80211_band band);
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only);
+ enum ieee80211_band band);
void ieee80211_apply_vhtcap_overrides(struct ieee80211_sub_if_data *sdata,
struct ieee80211_sta_vht_cap *vht_cap);
void ieee80211_get_vht_mask_from_cap(__le16 vht_cap,
@@ -1822,20 +1840,6 @@ static inline void ieee802_11_parse_elems(const u8 *start, size_t len,
ieee802_11_parse_elems_crc(start, len, action, elems, 0, 0);
}
-static inline bool ieee80211_rx_reorder_ready(struct sk_buff_head *frames)
-{
- struct sk_buff *tail = skb_peek_tail(frames);
- struct ieee80211_rx_status *status;
-
- if (!tail)
- return false;
-
- status = IEEE80211_SKB_RXCB(tail);
- if (status->flag & RX_FLAG_AMSDU_MORE)
- return false;
-
- return true;
-}
extern const int ieee802_1d_to_ac[8];
@@ -1979,12 +1983,10 @@ int ieee80211_add_ext_srates_ie(struct ieee80211_sub_if_data *sdata,
u8 *ieee80211_add_wmm_info_ie(u8 *buf, u8 qosinfo);
/* channel management */
-void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
- const struct ieee80211_ht_operation *ht_oper,
- struct cfg80211_chan_def *chandef);
-void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan,
- const struct ieee80211_vht_operation *oper,
- struct cfg80211_chan_def *chandef);
+bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
+ struct cfg80211_chan_def *chandef);
+bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper,
+ struct cfg80211_chan_def *chandef);
u32 ieee80211_chandef_downgrade(struct cfg80211_chan_def *c);
int __must_check
diff --git a/net/mac80211/iface.c b/net/mac80211/iface.c
index d0dc1bfaeec2..453b4e741780 100644
--- a/net/mac80211/iface.c
+++ b/net/mac80211/iface.c
@@ -76,7 +76,8 @@ bool __ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata)
void ieee80211_recalc_txpower(struct ieee80211_sub_if_data *sdata,
bool update_bss)
{
- if (__ieee80211_recalc_txpower(sdata) || update_bss)
+ if (__ieee80211_recalc_txpower(sdata) ||
+ (update_bss && ieee80211_sdata_running(sdata)))
ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_TXPOWER);
}
@@ -976,7 +977,11 @@ static void ieee80211_do_stop(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.txq) {
struct txq_info *txqi = to_txq_info(sdata->vif.txq);
+ spin_lock_bh(&txqi->queue.lock);
ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
+ txqi->byte_cnt = 0;
+ spin_unlock_bh(&txqi->queue.lock);
+
atomic_set(&sdata->txqs_len[txqi->txq.ac], 0);
}
@@ -1270,6 +1275,16 @@ static void ieee80211_iface_work(struct work_struct *work)
}
}
mutex_unlock(&local->sta_mtx);
+ } else if (ieee80211_is_action(mgmt->frame_control) &&
+ mgmt->u.action.category == WLAN_CATEGORY_VHT) {
+ switch (mgmt->u.action.u.vht_group_notif.action_code) {
+ case WLAN_VHT_ACTION_GROUPID_MGMT:
+ ieee80211_process_mu_groups(sdata, mgmt);
+ break;
+ default:
+ WARN_ON(1);
+ break;
+ }
} else if (ieee80211_is_data_qos(mgmt->frame_control)) {
struct ieee80211_hdr *hdr = (void *)mgmt;
/*
@@ -1861,6 +1876,7 @@ void ieee80211_if_remove(struct ieee80211_sub_if_data *sdata)
unregister_netdevice(sdata->dev);
} else {
cfg80211_unregister_wdev(&sdata->wdev);
+ ieee80211_teardown_sdata(sdata);
kfree(sdata);
}
}
@@ -1870,7 +1886,6 @@ void ieee80211_sdata_stop(struct ieee80211_sub_if_data *sdata)
if (WARN_ON_ONCE(!test_bit(SDATA_STATE_RUNNING, &sdata->state)))
return;
ieee80211_do_stop(sdata, true);
- ieee80211_teardown_sdata(sdata);
}
void ieee80211_remove_interfaces(struct ieee80211_local *local)
diff --git a/net/mac80211/key.c b/net/mac80211/key.c
index 44388d6a1d8e..3df7b0392d30 100644
--- a/net/mac80211/key.c
+++ b/net/mac80211/key.c
@@ -4,6 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007-2008 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright 2015 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -320,7 +321,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
return;
if (new)
- list_add_tail(&new->list, &sdata->key_list);
+ list_add_tail_rcu(&new->list, &sdata->key_list);
WARN_ON(new && old && new->conf.keyidx != old->conf.keyidx);
@@ -368,7 +369,7 @@ static void ieee80211_key_replace(struct ieee80211_sub_if_data *sdata,
}
if (old)
- list_del(&old->list);
+ list_del_rcu(&old->list);
}
struct ieee80211_key *
@@ -592,8 +593,8 @@ static void ieee80211_key_destroy(struct ieee80211_key *key,
return;
/*
- * Synchronize so the TX path can no longer be using
- * this key before we free/remove it.
+ * Synchronize so the TX path and rcu key iterators
+ * can no longer be using this key before we free/remove it.
*/
synchronize_net();
@@ -744,6 +745,53 @@ void ieee80211_iter_keys(struct ieee80211_hw *hw,
}
EXPORT_SYMBOL(ieee80211_iter_keys);
+static void
+_ieee80211_iter_keys_rcu(struct ieee80211_hw *hw,
+ struct ieee80211_sub_if_data *sdata,
+ void (*iter)(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta,
+ struct ieee80211_key_conf *key,
+ void *data),
+ void *iter_data)
+{
+ struct ieee80211_key *key;
+
+ list_for_each_entry_rcu(key, &sdata->key_list, list) {
+ /* skip keys of station in removal process */
+ if (key->sta && key->sta->removed)
+ continue;
+ if (!(key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE))
+ continue;
+
+ iter(hw, &sdata->vif,
+ key->sta ? &key->sta->sta : NULL,
+ &key->conf, iter_data);
+ }
+}
+
+void ieee80211_iter_keys_rcu(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ void (*iter)(struct ieee80211_hw *hw,
+ struct ieee80211_vif *vif,
+ struct ieee80211_sta *sta,
+ struct ieee80211_key_conf *key,
+ void *data),
+ void *iter_data)
+{
+ struct ieee80211_local *local = hw_to_local(hw);
+ struct ieee80211_sub_if_data *sdata;
+
+ if (vif) {
+ sdata = vif_to_sdata(vif);
+ _ieee80211_iter_keys_rcu(hw, sdata, iter, iter_data);
+ } else {
+ list_for_each_entry_rcu(sdata, &local->interfaces, list)
+ _ieee80211_iter_keys_rcu(hw, sdata, iter, iter_data);
+ }
+}
+EXPORT_SYMBOL(ieee80211_iter_keys_rcu);
+
static void ieee80211_free_keys_iface(struct ieee80211_sub_if_data *sdata,
struct list_head *keys)
{
@@ -884,50 +932,6 @@ void ieee80211_gtk_rekey_notify(struct ieee80211_vif *vif, const u8 *bssid,
}
EXPORT_SYMBOL_GPL(ieee80211_gtk_rekey_notify);
-void ieee80211_get_key_tx_seq(struct ieee80211_key_conf *keyconf,
- struct ieee80211_key_seq *seq)
-{
- struct ieee80211_key *key;
- u64 pn64;
-
- if (WARN_ON(!(keyconf->flags & IEEE80211_KEY_FLAG_GENERATE_IV)))
- return;
-
- key = container_of(keyconf, struct ieee80211_key, conf);
-
- switch (key->conf.cipher) {
- case WLAN_CIPHER_SUITE_TKIP:
- seq->tkip.iv32 = key->u.tkip.tx.iv32;
- seq->tkip.iv16 = key->u.tkip.tx.iv16;
- break;
- case WLAN_CIPHER_SUITE_CCMP:
- case WLAN_CIPHER_SUITE_CCMP_256:
- case WLAN_CIPHER_SUITE_AES_CMAC:
- case WLAN_CIPHER_SUITE_BIP_CMAC_256:
- BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
- offsetof(typeof(*seq), aes_cmac));
- case WLAN_CIPHER_SUITE_BIP_GMAC_128:
- case WLAN_CIPHER_SUITE_BIP_GMAC_256:
- BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
- offsetof(typeof(*seq), aes_gmac));
- case WLAN_CIPHER_SUITE_GCMP:
- case WLAN_CIPHER_SUITE_GCMP_256:
- BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
- offsetof(typeof(*seq), gcmp));
- pn64 = atomic64_read(&key->conf.tx_pn);
- seq->ccmp.pn[5] = pn64;
- seq->ccmp.pn[4] = pn64 >> 8;
- seq->ccmp.pn[3] = pn64 >> 16;
- seq->ccmp.pn[2] = pn64 >> 24;
- seq->ccmp.pn[1] = pn64 >> 32;
- seq->ccmp.pn[0] = pn64 >> 40;
- break;
- default:
- WARN_ON(1);
- }
-}
-EXPORT_SYMBOL(ieee80211_get_key_tx_seq);
-
void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
int tid, struct ieee80211_key_seq *seq)
{
@@ -981,48 +985,6 @@ void ieee80211_get_key_rx_seq(struct ieee80211_key_conf *keyconf,
}
EXPORT_SYMBOL(ieee80211_get_key_rx_seq);
-void ieee80211_set_key_tx_seq(struct ieee80211_key_conf *keyconf,
- struct ieee80211_key_seq *seq)
-{
- struct ieee80211_key *key;
- u64 pn64;
-
- key = container_of(keyconf, struct ieee80211_key, conf);
-
- switch (key->conf.cipher) {
- case WLAN_CIPHER_SUITE_TKIP:
- key->u.tkip.tx.iv32 = seq->tkip.iv32;
- key->u.tkip.tx.iv16 = seq->tkip.iv16;
- break;
- case WLAN_CIPHER_SUITE_CCMP:
- case WLAN_CIPHER_SUITE_CCMP_256:
- case WLAN_CIPHER_SUITE_AES_CMAC:
- case WLAN_CIPHER_SUITE_BIP_CMAC_256:
- BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
- offsetof(typeof(*seq), aes_cmac));
- case WLAN_CIPHER_SUITE_BIP_GMAC_128:
- case WLAN_CIPHER_SUITE_BIP_GMAC_256:
- BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
- offsetof(typeof(*seq), aes_gmac));
- case WLAN_CIPHER_SUITE_GCMP:
- case WLAN_CIPHER_SUITE_GCMP_256:
- BUILD_BUG_ON(offsetof(typeof(*seq), ccmp) !=
- offsetof(typeof(*seq), gcmp));
- pn64 = (u64)seq->ccmp.pn[5] |
- ((u64)seq->ccmp.pn[4] << 8) |
- ((u64)seq->ccmp.pn[3] << 16) |
- ((u64)seq->ccmp.pn[2] << 24) |
- ((u64)seq->ccmp.pn[1] << 32) |
- ((u64)seq->ccmp.pn[0] << 40);
- atomic64_set(&key->conf.tx_pn, pn64);
- break;
- default:
- WARN_ON(1);
- break;
- }
-}
-EXPORT_SYMBOL_GPL(ieee80211_set_key_tx_seq);
-
void ieee80211_set_key_rx_seq(struct ieee80211_key_conf *keyconf,
int tid, struct ieee80211_key_seq *seq)
{
diff --git a/net/mac80211/key.h b/net/mac80211/key.h
index 9951ef06323e..4aa20cef0859 100644
--- a/net/mac80211/key.h
+++ b/net/mac80211/key.h
@@ -44,13 +44,17 @@ enum ieee80211_internal_tkip_state {
};
struct tkip_ctx {
- u32 iv32; /* current iv32 */
- u16 iv16; /* current iv16 */
u16 p1k[5]; /* p1k cache */
u32 p1k_iv32; /* iv32 for which p1k computed */
enum ieee80211_internal_tkip_state state;
};
+struct tkip_ctx_rx {
+ struct tkip_ctx ctx;
+ u32 iv32; /* current iv32 */
+ u16 iv16; /* current iv16 */
+};
+
struct ieee80211_key {
struct ieee80211_local *local;
struct ieee80211_sub_if_data *sdata;
@@ -71,7 +75,7 @@ struct ieee80211_key {
struct tkip_ctx tx;
/* last received RSC */
- struct tkip_ctx rx[IEEE80211_NUM_TIDS];
+ struct tkip_ctx_rx rx[IEEE80211_NUM_TIDS];
/* number of mic failures */
u32 mic_failures;
diff --git a/net/mac80211/main.c b/net/mac80211/main.c
index 858f6b1cb149..8190bf27ebff 100644
--- a/net/mac80211/main.c
+++ b/net/mac80211/main.c
@@ -248,6 +248,7 @@ static void ieee80211_restart_work(struct work_struct *work)
/* wait for scan work complete */
flush_workqueue(local->workqueue);
+ flush_work(&local->sched_scan_stopped_work);
WARN(test_bit(SCAN_HW_SCANNING, &local->scanning),
"%s called with hardware scan in progress\n", __func__);
@@ -256,6 +257,11 @@ static void ieee80211_restart_work(struct work_struct *work)
list_for_each_entry(sdata, &local->interfaces, list)
flush_delayed_work(&sdata->dec_tailroom_needed_wk);
ieee80211_scan_cancel(local);
+
+ /* make sure any new ROC will consider local->in_reconfig */
+ flush_delayed_work(&local->roc_work);
+ flush_work(&local->hw_roc_done);
+
ieee80211_reconfig(local);
rtnl_unlock();
}
@@ -1149,6 +1155,7 @@ void ieee80211_unregister_hw(struct ieee80211_hw *hw)
rtnl_unlock();
+ cancel_delayed_work_sync(&local->roc_work);
cancel_work_sync(&local->restart_work);
cancel_work_sync(&local->reconfig_filter);
cancel_work_sync(&local->tdls_chsw_work);
diff --git a/net/mac80211/mesh.c b/net/mac80211/mesh.c
index fa28500f28fd..d32cefcb63b0 100644
--- a/net/mac80211/mesh.c
+++ b/net/mac80211/mesh.c
@@ -91,11 +91,10 @@ bool mesh_matches_local(struct ieee80211_sub_if_data *sdata,
if (sdata->vif.bss_conf.basic_rates != basic_rates)
return false;
- ieee80211_ht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan,
- ie->ht_operation, &sta_chan_def);
-
- ieee80211_vht_oper_to_chandef(sdata->vif.bss_conf.chandef.chan,
- ie->vht_operation, &sta_chan_def);
+ cfg80211_chandef_create(&sta_chan_def, sdata->vif.bss_conf.chandef.chan,
+ NL80211_CHAN_NO_HT);
+ ieee80211_chandef_ht_oper(ie->ht_operation, &sta_chan_def);
+ ieee80211_chandef_vht_oper(ie->vht_operation, &sta_chan_def);
if (!cfg80211_chandef_compatible(&sdata->vif.bss_conf.chandef,
&sta_chan_def))
@@ -1370,17 +1369,6 @@ out:
sdata_unlock(sdata);
}
-void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local)
-{
- struct ieee80211_sub_if_data *sdata;
-
- rcu_read_lock();
- list_for_each_entry_rcu(sdata, &local->interfaces, list)
- if (ieee80211_vif_is_mesh(&sdata->vif) &&
- ieee80211_sdata_running(sdata))
- ieee80211_queue_work(&local->hw, &sdata->work);
- rcu_read_unlock();
-}
void ieee80211_mesh_init_sdata(struct ieee80211_sub_if_data *sdata)
{
diff --git a/net/mac80211/mesh.h b/net/mac80211/mesh.h
index a1596344c3ba..87c017a3b1ce 100644
--- a/net/mac80211/mesh.h
+++ b/net/mac80211/mesh.h
@@ -137,8 +137,6 @@ struct mesh_path {
* @copy_node: function to copy nodes of the table
* @size_order: determines size of the table, there will be 2^size_order hash
* buckets
- * @mean_chain_len: maximum average length for the hash buckets' list, if it is
- * reached, the table will grow
* @known_gates: list of known mesh gates and their mpaths by the station. The
* gate's mpath may or may not be resolved and active.
*
@@ -154,7 +152,6 @@ struct mesh_table {
void (*free_node) (struct hlist_node *p, bool free_leafs);
int (*copy_node) (struct hlist_node *p, struct mesh_table *newtbl);
int size_order;
- int mean_chain_len;
struct hlist_head *known_gates;
spinlock_t gates_lock;
@@ -362,14 +359,10 @@ static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
return sdata->u.mesh.mesh_pp_id == IEEE80211_PATH_PROTOCOL_HWMP;
}
-void ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local);
-
void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata);
void mesh_sync_adjust_tbtt(struct ieee80211_sub_if_data *sdata);
void ieee80211s_stop(void);
#else
-static inline void
-ieee80211_mesh_notify_scan_completed(struct ieee80211_local *local) {}
static inline bool mesh_path_sel_is_hwmp(struct ieee80211_sub_if_data *sdata)
{ return false; }
static inline void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
diff --git a/net/mac80211/mesh_hwmp.c b/net/mac80211/mesh_hwmp.c
index c6be0b4f4058..002244bca948 100644
--- a/net/mac80211/mesh_hwmp.c
+++ b/net/mac80211/mesh_hwmp.c
@@ -205,9 +205,9 @@ static void prepare_frame_for_deferred_tx(struct ieee80211_sub_if_data *sdata,
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
- skb_set_mac_header(skb, 0);
- skb_set_network_header(skb, 0);
- skb_set_transport_header(skb, 0);
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
/* Send all internal mgmt frames on VO. Accordingly set TID to 7. */
skb_set_queue_mapping(skb, IEEE80211_AC_VO);
@@ -530,7 +530,7 @@ static void hwmp_preq_frame_process(struct ieee80211_sub_if_data *sdata,
const u8 *target_addr, *orig_addr;
const u8 *da;
u8 target_flags, ttl, flags;
- u32 orig_sn, target_sn, lifetime, target_metric;
+ u32 orig_sn, target_sn, lifetime, target_metric = 0;
bool reply = false;
bool forward = true;
bool root_is_gate;
diff --git a/net/mac80211/mesh_pathtbl.c b/net/mac80211/mesh_pathtbl.c
index b890e225a8f1..2ba7aa56b11c 100644
--- a/net/mac80211/mesh_pathtbl.c
+++ b/net/mac80211/mesh_pathtbl.c
@@ -55,16 +55,21 @@ int mpp_paths_generation;
static DEFINE_RWLOCK(pathtbl_resize_lock);
+static inline struct mesh_table *resize_dereference_paths(
+ struct mesh_table __rcu *table)
+{
+ return rcu_dereference_protected(table,
+ lockdep_is_held(&pathtbl_resize_lock));
+}
+
static inline struct mesh_table *resize_dereference_mesh_paths(void)
{
- return rcu_dereference_protected(mesh_paths,
- lockdep_is_held(&pathtbl_resize_lock));
+ return resize_dereference_paths(mesh_paths);
}
static inline struct mesh_table *resize_dereference_mpp_paths(void)
{
- return rcu_dereference_protected(mpp_paths,
- lockdep_is_held(&pathtbl_resize_lock));
+ return resize_dereference_paths(mpp_paths);
}
/*
@@ -160,11 +165,10 @@ static int mesh_table_grow(struct mesh_table *oldtbl,
int i;
if (atomic_read(&oldtbl->entries)
- < oldtbl->mean_chain_len * (oldtbl->hash_mask + 1))
+ < MEAN_CHAIN_LEN * (oldtbl->hash_mask + 1))
return -EAGAIN;
newtbl->free_node = oldtbl->free_node;
- newtbl->mean_chain_len = oldtbl->mean_chain_len;
newtbl->copy_node = oldtbl->copy_node;
newtbl->known_gates = oldtbl->known_gates;
atomic_set(&newtbl->entries, atomic_read(&oldtbl->entries));
@@ -585,7 +589,7 @@ struct mesh_path *mesh_path_add(struct ieee80211_sub_if_data *sdata,
hlist_add_head_rcu(&new_node->list, bucket);
if (atomic_inc_return(&tbl->entries) >=
- tbl->mean_chain_len * (tbl->hash_mask + 1))
+ MEAN_CHAIN_LEN * (tbl->hash_mask + 1))
grow = 1;
mesh_paths_generation++;
@@ -714,7 +718,7 @@ int mpp_path_add(struct ieee80211_sub_if_data *sdata,
hlist_add_head_rcu(&new_node->list, bucket);
if (atomic_inc_return(&tbl->entries) >=
- tbl->mean_chain_len * (tbl->hash_mask + 1))
+ MEAN_CHAIN_LEN * (tbl->hash_mask + 1))
grow = 1;
spin_unlock(&tbl->hashwlock[hash_idx]);
@@ -779,10 +783,8 @@ void mesh_plink_broken(struct sta_info *sta)
static void mesh_path_node_reclaim(struct rcu_head *rp)
{
struct mpath_node *node = container_of(rp, struct mpath_node, rcu);
- struct ieee80211_sub_if_data *sdata = node->mpath->sdata;
del_timer_sync(&node->mpath->timer);
- atomic_dec(&sdata->u.mesh.mpaths);
kfree(node->mpath);
kfree(node);
}
@@ -790,8 +792,9 @@ static void mesh_path_node_reclaim(struct rcu_head *rp)
/* needs to be called with the corresponding hashwlock taken */
static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node)
{
- struct mesh_path *mpath;
- mpath = node->mpath;
+ struct mesh_path *mpath = node->mpath;
+ struct ieee80211_sub_if_data *sdata = node->mpath->sdata;
+
spin_lock(&mpath->state_lock);
mpath->flags |= MESH_PATH_RESOLVING;
if (mpath->is_gate)
@@ -799,6 +802,7 @@ static void __mesh_path_del(struct mesh_table *tbl, struct mpath_node *node)
hlist_del_rcu(&node->list);
call_rcu(&node->rcu, mesh_path_node_reclaim);
spin_unlock(&mpath->state_lock);
+ atomic_dec(&sdata->u.mesh.mpaths);
atomic_dec(&tbl->entries);
}
@@ -835,6 +839,29 @@ void mesh_path_flush_by_nexthop(struct sta_info *sta)
rcu_read_unlock();
}
+static void mpp_flush_by_proxy(struct ieee80211_sub_if_data *sdata,
+ const u8 *proxy)
+{
+ struct mesh_table *tbl;
+ struct mesh_path *mpp;
+ struct mpath_node *node;
+ int i;
+
+ rcu_read_lock();
+ read_lock_bh(&pathtbl_resize_lock);
+ tbl = resize_dereference_mpp_paths();
+ for_each_mesh_entry(tbl, node, i) {
+ mpp = node->mpath;
+ if (ether_addr_equal(mpp->mpp, proxy)) {
+ spin_lock(&tbl->hashwlock[i]);
+ __mesh_path_del(tbl, node);
+ spin_unlock(&tbl->hashwlock[i]);
+ }
+ }
+ read_unlock_bh(&pathtbl_resize_lock);
+ rcu_read_unlock();
+}
+
static void table_flush_by_iface(struct mesh_table *tbl,
struct ieee80211_sub_if_data *sdata)
{
@@ -876,14 +903,17 @@ void mesh_path_flush_by_iface(struct ieee80211_sub_if_data *sdata)
}
/**
- * mesh_path_del - delete a mesh path from the table
+ * table_path_del - delete a path from the mesh or mpp table
*
- * @addr: dst address (ETH_ALEN length)
+ * @tbl: mesh or mpp path table
* @sdata: local subif
+ * @addr: dst address (ETH_ALEN length)
*
* Returns: 0 if successful
*/
-int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+static int table_path_del(struct mesh_table __rcu *rcu_tbl,
+ struct ieee80211_sub_if_data *sdata,
+ const u8 *addr)
{
struct mesh_table *tbl;
struct mesh_path *mpath;
@@ -892,8 +922,7 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
int hash_idx;
int err = 0;
- read_lock_bh(&pathtbl_resize_lock);
- tbl = resize_dereference_mesh_paths();
+ tbl = resize_dereference_paths(rcu_tbl);
hash_idx = mesh_table_hash(addr, sdata, tbl);
bucket = &tbl->hash_buckets[hash_idx];
@@ -909,9 +938,50 @@ int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
err = -ENXIO;
enddel:
- mesh_paths_generation++;
spin_unlock(&tbl->hashwlock[hash_idx]);
+ return err;
+}
+
+/**
+ * mesh_path_del - delete a mesh path from the table
+ *
+ * @addr: dst address (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Returns: 0 if successful
+ */
+int mesh_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+ int err = 0;
+
+ /* flush relevant mpp entries first */
+ mpp_flush_by_proxy(sdata, addr);
+
+ read_lock_bh(&pathtbl_resize_lock);
+ err = table_path_del(mesh_paths, sdata, addr);
+ mesh_paths_generation++;
+ read_unlock_bh(&pathtbl_resize_lock);
+
+ return err;
+}
+
+/**
+ * mpp_path_del - delete a mesh proxy path from the table
+ *
+ * @addr: addr address (ETH_ALEN length)
+ * @sdata: local subif
+ *
+ * Returns: 0 if successful
+ */
+static int mpp_path_del(struct ieee80211_sub_if_data *sdata, const u8 *addr)
+{
+ int err = 0;
+
+ read_lock_bh(&pathtbl_resize_lock);
+ err = table_path_del(mpp_paths, sdata, addr);
+ mpp_paths_generation++;
read_unlock_bh(&pathtbl_resize_lock);
+
return err;
}
@@ -968,8 +1038,8 @@ int mesh_path_send_to_gates(struct mesh_path *mpath)
copy = true;
} else {
mpath_dbg(sdata,
- "Not forwarding %p (flags %#x)\n",
- gate->mpath, gate->mpath->flags);
+ "Not forwarding to %pM (flags %#x)\n",
+ gate->mpath->dst, gate->mpath->flags);
}
}
@@ -1076,7 +1146,6 @@ int mesh_pathtbl_init(void)
return -ENOMEM;
tbl_path->free_node = &mesh_path_node_free;
tbl_path->copy_node = &mesh_path_node_copy;
- tbl_path->mean_chain_len = MEAN_CHAIN_LEN;
tbl_path->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC);
if (!tbl_path->known_gates) {
ret = -ENOMEM;
@@ -1092,7 +1161,6 @@ int mesh_pathtbl_init(void)
}
tbl_mpp->free_node = &mesh_path_node_free;
tbl_mpp->copy_node = &mesh_path_node_copy;
- tbl_mpp->mean_chain_len = MEAN_CHAIN_LEN;
tbl_mpp->known_gates = kzalloc(sizeof(struct hlist_head), GFP_ATOMIC);
if (!tbl_mpp->known_gates) {
ret = -ENOMEM;
@@ -1131,6 +1199,17 @@ void mesh_path_expire(struct ieee80211_sub_if_data *sdata)
time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
mesh_path_del(mpath->sdata, mpath->dst);
}
+
+ tbl = rcu_dereference(mpp_paths);
+ for_each_mesh_entry(tbl, node, i) {
+ if (node->mpath->sdata != sdata)
+ continue;
+ mpath = node->mpath;
+ if ((!(mpath->flags & MESH_PATH_FIXED)) &&
+ time_after(jiffies, mpath->exp_time + MESH_PATH_EXPIRE))
+ mpp_path_del(mpath->sdata, mpath->dst);
+ }
+
rcu_read_unlock();
}
diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index bd3d55eb21d4..a07e93c21c9e 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -976,6 +976,10 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
mpl_dbg(sdata, "Mesh plink error: no more free plinks\n");
goto out;
}
+
+ /* new matching peer */
+ event = OPN_ACPT;
+ goto out;
} else {
if (!test_sta_flag(sta, WLAN_STA_AUTH)) {
mpl_dbg(sdata, "Mesh plink: Action frame from non-authed peer\n");
@@ -985,12 +989,6 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
goto out;
}
- /* new matching peer */
- if (!sta) {
- event = OPN_ACPT;
- goto out;
- }
-
switch (ftype) {
case WLAN_SP_MESH_PEERING_OPEN:
if (!matches_local)
diff --git a/net/mac80211/mlme.c b/net/mac80211/mlme.c
index b140cc6651f4..281b8d6e5109 100644
--- a/net/mac80211/mlme.c
+++ b/net/mac80211/mlme.c
@@ -6,7 +6,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007, Michael Wu <flamingice@sourmilk.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright (C) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2015 - 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -196,16 +196,7 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
/* check 40 MHz support, if we have it */
if (sta_ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40) {
- switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
- case IEEE80211_HT_PARAM_CHA_SEC_ABOVE:
- chandef->width = NL80211_CHAN_WIDTH_40;
- chandef->center_freq1 += 10;
- break;
- case IEEE80211_HT_PARAM_CHA_SEC_BELOW:
- chandef->width = NL80211_CHAN_WIDTH_40;
- chandef->center_freq1 -= 10;
- break;
- }
+ ieee80211_chandef_ht_oper(ht_oper, chandef);
} else {
/* 40 MHz (and 80 MHz) must be supported for VHT */
ret = IEEE80211_STA_DISABLE_VHT;
@@ -219,35 +210,11 @@ ieee80211_determine_chantype(struct ieee80211_sub_if_data *sdata,
goto out;
}
- vht_chandef.chan = channel;
- vht_chandef.center_freq1 =
- ieee80211_channel_to_frequency(vht_oper->center_freq_seg1_idx,
- channel->band);
- vht_chandef.center_freq2 = 0;
-
- switch (vht_oper->chan_width) {
- case IEEE80211_VHT_CHANWIDTH_USE_HT:
- vht_chandef.width = chandef->width;
- vht_chandef.center_freq1 = chandef->center_freq1;
- break;
- case IEEE80211_VHT_CHANWIDTH_80MHZ:
- vht_chandef.width = NL80211_CHAN_WIDTH_80;
- break;
- case IEEE80211_VHT_CHANWIDTH_160MHZ:
- vht_chandef.width = NL80211_CHAN_WIDTH_160;
- break;
- case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
- vht_chandef.width = NL80211_CHAN_WIDTH_80P80;
- vht_chandef.center_freq2 =
- ieee80211_channel_to_frequency(
- vht_oper->center_freq_seg2_idx,
- channel->band);
- break;
- default:
+ vht_chandef = *chandef;
+ if (!ieee80211_chandef_vht_oper(vht_oper, &vht_chandef)) {
if (!(ifmgd->flags & IEEE80211_STA_DISABLE_VHT))
sdata_info(sdata,
- "AP VHT operation IE has invalid channel width (%d), disable VHT\n",
- vht_oper->chan_width);
+ "AP VHT information is invalid, disable VHT\n");
ret = IEEE80211_STA_DISABLE_VHT;
goto out;
}
@@ -592,7 +559,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
struct ieee80211_sub_if_data *other;
list_for_each_entry_rcu(other, &local->interfaces, list) {
- if (other->flags & IEEE80211_SDATA_MU_MIMO_OWNER) {
+ if (other->vif.mu_mimo_owner) {
disable_mu_mimo = true;
break;
}
@@ -600,7 +567,7 @@ static void ieee80211_add_vht_ie(struct ieee80211_sub_if_data *sdata,
if (disable_mu_mimo)
cap &= ~IEEE80211_VHT_CAP_MU_BEAMFORMEE_CAPABLE;
else
- sdata->flags |= IEEE80211_SDATA_MU_MIMO_OWNER;
+ sdata->vif.mu_mimo_owner = true;
}
mask = IEEE80211_VHT_CAP_BEAMFORMEE_STS_MASK;
@@ -1379,21 +1346,26 @@ static u32 ieee80211_handle_pwr_constr(struct ieee80211_sub_if_data *sdata,
*/
if (has_80211h_pwr &&
(!has_cisco_pwr || pwr_level_80211h <= pwr_level_cisco)) {
+ new_ap_level = pwr_level_80211h;
+
+ if (sdata->ap_power_level == new_ap_level)
+ return 0;
+
sdata_dbg(sdata,
"Limiting TX power to %d (%d - %d) dBm as advertised by %pM\n",
pwr_level_80211h, chan_pwr, pwr_reduction_80211h,
sdata->u.mgd.bssid);
- new_ap_level = pwr_level_80211h;
} else { /* has_cisco_pwr is always true here. */
+ new_ap_level = pwr_level_cisco;
+
+ if (sdata->ap_power_level == new_ap_level)
+ return 0;
+
sdata_dbg(sdata,
"Limiting TX power to %d dBm as advertised by %pM\n",
pwr_level_cisco, sdata->u.mgd.bssid);
- new_ap_level = pwr_level_cisco;
}
- if (sdata->ap_power_level == new_ap_level)
- return 0;
-
sdata->ap_power_level = new_ap_level;
if (__ieee80211_recalc_txpower(sdata))
return BSS_CHANGED_TXPOWER;
@@ -1633,8 +1605,7 @@ void ieee80211_dynamic_ps_timer(unsigned long data)
void ieee80211_dfs_cac_timer_work(struct work_struct *work)
{
- struct delayed_work *delayed_work =
- container_of(work, struct delayed_work, work);
+ struct delayed_work *delayed_work = to_delayed_work(work);
struct ieee80211_sub_if_data *sdata =
container_of(delayed_work, struct ieee80211_sub_if_data,
dfs_cac_timer_work);
@@ -1930,7 +1901,8 @@ static void ieee80211_set_associated(struct ieee80211_sub_if_data *sdata,
sdata->u.mgd.flags |= IEEE80211_STA_RESET_SIGNAL_AVE;
- if (sdata->vif.p2p) {
+ if (sdata->vif.p2p ||
+ sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) {
const struct cfg80211_bss_ies *ies;
rcu_read_lock();
@@ -2073,7 +2045,14 @@ static void ieee80211_set_disassoc(struct ieee80211_sub_if_data *sdata,
memset(&ifmgd->ht_capa_mask, 0, sizeof(ifmgd->ht_capa_mask));
memset(&ifmgd->vht_capa, 0, sizeof(ifmgd->vht_capa));
memset(&ifmgd->vht_capa_mask, 0, sizeof(ifmgd->vht_capa_mask));
- sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
+
+ /* reset MU-MIMO ownership and group data */
+ memset(sdata->vif.bss_conf.mu_group.membership, 0,
+ sizeof(sdata->vif.bss_conf.mu_group.membership));
+ memset(sdata->vif.bss_conf.mu_group.position, 0,
+ sizeof(sdata->vif.bss_conf.mu_group.position));
+ changed |= BSS_CHANGED_MU_GROUPS;
+ sdata->vif.mu_mimo_owner = false;
sdata->ap_power_level = IEEE80211_UNSET_POWER_LEVEL;
@@ -2530,7 +2509,8 @@ static void ieee80211_destroy_assoc_data(struct ieee80211_sub_if_data *sdata,
eth_zero_addr(sdata->u.mgd.bssid);
ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_BSSID);
sdata->u.mgd.flags = 0;
- sdata->flags &= ~IEEE80211_SDATA_MU_MIMO_OWNER;
+ sdata->vif.mu_mimo_owner = false;
+
mutex_lock(&sdata->local->mtx);
ieee80211_vif_release_channel(sdata);
mutex_unlock(&sdata->local->mtx);
@@ -3458,7 +3438,8 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
}
}
- if (sdata->vif.p2p) {
+ if (sdata->vif.p2p ||
+ sdata->vif.driver_flags & IEEE80211_VIF_GET_NOA_UPDATE) {
struct ieee80211_p2p_noa_attr noa = {};
int ret;
@@ -3564,6 +3545,9 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
elems.ht_cap_elem, elems.ht_operation,
elems.vht_operation, bssid, &changed)) {
mutex_unlock(&local->sta_mtx);
+ sdata_info(sdata,
+ "failed to follow AP %pM bandwidth change, disconnect\n",
+ bssid);
ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
WLAN_REASON_DEAUTH_LEAVING,
true, deauth_buf);
@@ -3575,7 +3559,7 @@ static void ieee80211_rx_mgmt_beacon(struct ieee80211_sub_if_data *sdata,
if (sta && elems.opmode_notif)
ieee80211_vht_handle_opmode(sdata, sta, *elems.opmode_notif,
- rx_status->band, true);
+ rx_status->band);
mutex_unlock(&local->sta_mtx);
changed |= ieee80211_handle_pwr_constr(sdata, chan, mgmt,
@@ -3939,11 +3923,9 @@ void ieee80211_sta_work(struct ieee80211_sub_if_data *sdata)
* We actually lost the connection ... or did we?
* Let's make sure!
*/
- wiphy_debug(local->hw.wiphy,
- "%s: No probe response from AP %pM"
- " after %dms, disconnecting.\n",
- sdata->name,
- bssid, probe_wait_ms);
+ mlme_dbg(sdata,
+ "No probe response from AP %pM after %dms, disconnecting.\n",
+ bssid, probe_wait_ms);
ieee80211_sta_connection_lost(sdata, bssid,
WLAN_REASON_DISASSOC_DUE_TO_INACTIVITY, false);
@@ -3998,8 +3980,6 @@ static void ieee80211_restart_sta_timer(struct ieee80211_sub_if_data *sdata)
if (!ieee80211_hw_check(&sdata->local->hw, CONNECTION_MONITOR))
ieee80211_queue_work(&sdata->local->hw,
&sdata->u.mgd.monitor_work);
- /* and do all the other regular work too */
- ieee80211_queue_work(&sdata->local->hw, &sdata->work);
}
}
@@ -4531,6 +4511,9 @@ int ieee80211_mgd_auth(struct ieee80211_sub_if_data *sdata,
if (ifmgd->associated) {
u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+ sdata_info(sdata,
+ "disconnect from AP %pM for new auth to %pM\n",
+ ifmgd->associated->bssid, req->bss->bssid);
ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
WLAN_REASON_UNSPECIFIED,
false, frame_buf);
@@ -4599,6 +4582,9 @@ int ieee80211_mgd_assoc(struct ieee80211_sub_if_data *sdata,
if (ifmgd->associated) {
u8 frame_buf[IEEE80211_DEAUTH_FRAME_LEN];
+ sdata_info(sdata,
+ "disconnect from AP %pM for new assoc to %pM\n",
+ ifmgd->associated->bssid, req->bss->bssid);
ieee80211_set_disassoc(sdata, IEEE80211_STYPE_DEAUTH,
WLAN_REASON_UNSPECIFIED,
false, frame_buf);
diff --git a/net/mac80211/offchannel.c b/net/mac80211/offchannel.c
index 04401037140e..55a9c5b94ce1 100644
--- a/net/mac80211/offchannel.c
+++ b/net/mac80211/offchannel.c
@@ -187,11 +187,77 @@ void ieee80211_offchannel_return(struct ieee80211_local *local)
false);
}
-void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc)
+static void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc)
{
- if (roc->notified)
+ /* was never transmitted */
+ if (roc->frame) {
+ cfg80211_mgmt_tx_status(&roc->sdata->wdev, roc->mgmt_tx_cookie,
+ roc->frame->data, roc->frame->len,
+ false, GFP_KERNEL);
+ ieee80211_free_txskb(&roc->sdata->local->hw, roc->frame);
+ }
+
+ if (!roc->mgmt_tx_cookie)
+ cfg80211_remain_on_channel_expired(&roc->sdata->wdev,
+ roc->cookie, roc->chan,
+ GFP_KERNEL);
+
+ list_del(&roc->list);
+ kfree(roc);
+}
+
+static unsigned long ieee80211_end_finished_rocs(struct ieee80211_local *local,
+ unsigned long now)
+{
+ struct ieee80211_roc_work *roc, *tmp;
+ long remaining_dur_min = LONG_MAX;
+
+ lockdep_assert_held(&local->mtx);
+
+ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) {
+ long remaining;
+
+ if (!roc->started)
+ break;
+
+ remaining = roc->start_time +
+ msecs_to_jiffies(roc->duration) -
+ now;
+
+ /* In case of HW ROC, it is possible that the HW finished the
+ * ROC session before the actual requested time. In such a case
+ * end the ROC session (disregarding the remaining time).
+ */
+ if (roc->abort || roc->hw_begun || remaining <= 0)
+ ieee80211_roc_notify_destroy(roc);
+ else
+ remaining_dur_min = min(remaining_dur_min, remaining);
+ }
+
+ return remaining_dur_min;
+}
+
+static bool ieee80211_recalc_sw_work(struct ieee80211_local *local,
+ unsigned long now)
+{
+ long dur = ieee80211_end_finished_rocs(local, now);
+
+ if (dur == LONG_MAX)
+ return false;
+
+ mod_delayed_work(local->workqueue, &local->roc_work, dur);
+ return true;
+}
+
+static void ieee80211_handle_roc_started(struct ieee80211_roc_work *roc,
+ unsigned long start_time)
+{
+ if (WARN_ON(roc->notified))
return;
+ roc->start_time = start_time;
+ roc->started = true;
+
if (roc->mgmt_tx_cookie) {
if (!WARN_ON(!roc->frame)) {
ieee80211_tx_skb_tid_band(roc->sdata, roc->frame, 7,
@@ -211,34 +277,18 @@ static void ieee80211_hw_roc_start(struct work_struct *work)
{
struct ieee80211_local *local =
container_of(work, struct ieee80211_local, hw_roc_start);
- struct ieee80211_roc_work *roc, *dep, *tmp;
+ struct ieee80211_roc_work *roc;
mutex_lock(&local->mtx);
- if (list_empty(&local->roc_list))
- goto out_unlock;
+ list_for_each_entry(roc, &local->roc_list, list) {
+ if (!roc->started)
+ break;
- roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work,
- list);
-
- if (!roc->started)
- goto out_unlock;
-
- roc->hw_begun = true;
- roc->hw_start_time = local->hw_roc_start_time;
-
- ieee80211_handle_roc_started(roc);
- list_for_each_entry_safe(dep, tmp, &roc->dependents, list) {
- ieee80211_handle_roc_started(dep);
-
- if (dep->duration > roc->duration) {
- u32 dur = dep->duration;
- dep->duration = dur - roc->duration;
- roc->duration = dur;
- list_move(&dep->list, &roc->list);
- }
+ roc->hw_begun = true;
+ ieee80211_handle_roc_started(roc, local->hw_roc_start_time);
}
- out_unlock:
+
mutex_unlock(&local->mtx);
}
@@ -254,34 +304,40 @@ void ieee80211_ready_on_channel(struct ieee80211_hw *hw)
}
EXPORT_SYMBOL_GPL(ieee80211_ready_on_channel);
-void ieee80211_start_next_roc(struct ieee80211_local *local)
+static void _ieee80211_start_next_roc(struct ieee80211_local *local)
{
- struct ieee80211_roc_work *roc;
+ struct ieee80211_roc_work *roc, *tmp;
+ enum ieee80211_roc_type type;
+ u32 min_dur, max_dur;
lockdep_assert_held(&local->mtx);
- if (list_empty(&local->roc_list)) {
- ieee80211_run_deferred_scan(local);
+ if (WARN_ON(list_empty(&local->roc_list)))
return;
- }
roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work,
list);
- if (WARN_ON_ONCE(roc->started))
+ if (WARN_ON(roc->started))
return;
- if (local->ops->remain_on_channel) {
- int ret, duration = roc->duration;
-
- /* XXX: duplicated, see ieee80211_start_roc_work() */
- if (!duration)
- duration = 10;
+ min_dur = roc->duration;
+ max_dur = roc->duration;
+ type = roc->type;
- ret = drv_remain_on_channel(local, roc->sdata, roc->chan,
- duration, roc->type);
+ list_for_each_entry(tmp, &local->roc_list, list) {
+ if (tmp == roc)
+ continue;
+ if (tmp->sdata != roc->sdata || tmp->chan != roc->chan)
+ break;
+ max_dur = max(tmp->duration, max_dur);
+ min_dur = min(tmp->duration, min_dur);
+ type = max(tmp->type, type);
+ }
- roc->started = true;
+ if (local->ops->remain_on_channel) {
+ int ret = drv_remain_on_channel(local, roc->sdata, roc->chan,
+ max_dur, type);
if (ret) {
wiphy_warn(local->hw.wiphy,
@@ -290,74 +346,24 @@ void ieee80211_start_next_roc(struct ieee80211_local *local)
* queue the work struct again to avoid recursion
* when multiple failures occur
*/
- ieee80211_remain_on_channel_expired(&local->hw);
+ list_for_each_entry(tmp, &local->roc_list, list) {
+ if (tmp->sdata != roc->sdata ||
+ tmp->chan != roc->chan)
+ break;
+ tmp->started = true;
+ tmp->abort = true;
+ }
+ ieee80211_queue_work(&local->hw, &local->hw_roc_done);
+ return;
}
- } else {
- /* delay it a bit */
- ieee80211_queue_delayed_work(&local->hw, &roc->work,
- round_jiffies_relative(HZ/2));
- }
-}
-
-void ieee80211_roc_notify_destroy(struct ieee80211_roc_work *roc, bool free)
-{
- struct ieee80211_roc_work *dep, *tmp;
-
- if (WARN_ON(roc->to_be_freed))
- return;
-
- /* was never transmitted */
- if (roc->frame) {
- cfg80211_mgmt_tx_status(&roc->sdata->wdev,
- (unsigned long)roc->frame,
- roc->frame->data, roc->frame->len,
- false, GFP_KERNEL);
- kfree_skb(roc->frame);
- }
-
- if (!roc->mgmt_tx_cookie)
- cfg80211_remain_on_channel_expired(&roc->sdata->wdev,
- roc->cookie, roc->chan,
- GFP_KERNEL);
-
- list_for_each_entry_safe(dep, tmp, &roc->dependents, list)
- ieee80211_roc_notify_destroy(dep, true);
-
- if (free)
- kfree(roc);
- else
- roc->to_be_freed = true;
-}
-
-void ieee80211_sw_roc_work(struct work_struct *work)
-{
- struct ieee80211_roc_work *roc =
- container_of(work, struct ieee80211_roc_work, work.work);
- struct ieee80211_sub_if_data *sdata = roc->sdata;
- struct ieee80211_local *local = sdata->local;
- bool started, on_channel;
-
- mutex_lock(&local->mtx);
-
- if (roc->to_be_freed)
- goto out_unlock;
-
- if (roc->abort)
- goto finish;
-
- if (WARN_ON(list_empty(&local->roc_list)))
- goto out_unlock;
-
- if (WARN_ON(roc != list_first_entry(&local->roc_list,
- struct ieee80211_roc_work,
- list)))
- goto out_unlock;
-
- if (!roc->started) {
- struct ieee80211_roc_work *dep;
-
- WARN_ON(local->use_chanctx);
+ /* we'll notify about the start once the HW calls back */
+ list_for_each_entry(tmp, &local->roc_list, list) {
+ if (tmp->sdata != roc->sdata || tmp->chan != roc->chan)
+ break;
+ tmp->started = true;
+ }
+ } else {
/* If actually operating on the desired channel (with at least
* 20 MHz channel width) don't stop all the operations but still
* treat it as though the ROC operation started properly, so
@@ -377,27 +383,76 @@ void ieee80211_sw_roc_work(struct work_struct *work)
ieee80211_hw_config(local, 0);
}
- /* tell userspace or send frame */
- ieee80211_handle_roc_started(roc);
- list_for_each_entry(dep, &roc->dependents, list)
- ieee80211_handle_roc_started(dep);
+ ieee80211_queue_delayed_work(&local->hw, &local->roc_work,
+ msecs_to_jiffies(min_dur));
+
+ /* tell userspace or send frame(s) */
+ list_for_each_entry(tmp, &local->roc_list, list) {
+ if (tmp->sdata != roc->sdata || tmp->chan != roc->chan)
+ break;
- /* if it was pure TX, just finish right away */
- if (!roc->duration)
- goto finish;
+ tmp->on_channel = roc->on_channel;
+ ieee80211_handle_roc_started(tmp, jiffies);
+ }
+ }
+}
- roc->started = true;
- ieee80211_queue_delayed_work(&local->hw, &roc->work,
- msecs_to_jiffies(roc->duration));
+void ieee80211_start_next_roc(struct ieee80211_local *local)
+{
+ struct ieee80211_roc_work *roc;
+
+ lockdep_assert_held(&local->mtx);
+
+ if (list_empty(&local->roc_list)) {
+ ieee80211_run_deferred_scan(local);
+ return;
+ }
+
+ /* defer roc if driver is not started (i.e. during reconfig) */
+ if (local->in_reconfig)
+ return;
+
+ roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work,
+ list);
+
+ if (WARN_ON_ONCE(roc->started))
+ return;
+
+ if (local->ops->remain_on_channel) {
+ _ieee80211_start_next_roc(local);
+ } else {
+ /* delay it a bit */
+ ieee80211_queue_delayed_work(&local->hw, &local->roc_work,
+ round_jiffies_relative(HZ/2));
+ }
+}
+
+static void __ieee80211_roc_work(struct ieee80211_local *local)
+{
+ struct ieee80211_roc_work *roc;
+ bool on_channel;
+
+ lockdep_assert_held(&local->mtx);
+
+ if (WARN_ON(local->ops->remain_on_channel))
+ return;
+
+ roc = list_first_entry_or_null(&local->roc_list,
+ struct ieee80211_roc_work, list);
+ if (!roc)
+ return;
+
+ if (!roc->started) {
+ WARN_ON(local->use_chanctx);
+ _ieee80211_start_next_roc(local);
} else {
- /* finish this ROC */
- finish:
- list_del(&roc->list);
- started = roc->started;
on_channel = roc->on_channel;
- ieee80211_roc_notify_destroy(roc, !roc->abort);
+ if (ieee80211_recalc_sw_work(local, jiffies))
+ return;
+
+ /* careful - roc pointer became invalid during recalc */
- if (started && !on_channel) {
+ if (!on_channel) {
ieee80211_flush_queues(local, NULL, false);
local->tmp_channel = NULL;
@@ -407,14 +462,17 @@ void ieee80211_sw_roc_work(struct work_struct *work)
}
ieee80211_recalc_idle(local);
-
- if (started)
- ieee80211_start_next_roc(local);
- else if (list_empty(&local->roc_list))
- ieee80211_run_deferred_scan(local);
+ ieee80211_start_next_roc(local);
}
+}
- out_unlock:
+static void ieee80211_roc_work(struct work_struct *work)
+{
+ struct ieee80211_local *local =
+ container_of(work, struct ieee80211_local, roc_work.work);
+
+ mutex_lock(&local->mtx);
+ __ieee80211_roc_work(local);
mutex_unlock(&local->mtx);
}
@@ -422,27 +480,14 @@ static void ieee80211_hw_roc_done(struct work_struct *work)
{
struct ieee80211_local *local =
container_of(work, struct ieee80211_local, hw_roc_done);
- struct ieee80211_roc_work *roc;
mutex_lock(&local->mtx);
- if (list_empty(&local->roc_list))
- goto out_unlock;
-
- roc = list_first_entry(&local->roc_list, struct ieee80211_roc_work,
- list);
-
- if (!roc->started)
- goto out_unlock;
-
- list_del(&roc->list);
-
- ieee80211_roc_notify_destroy(roc, true);
+ ieee80211_end_finished_rocs(local, jiffies);
/* if there's another roc, start it now */
ieee80211_start_next_roc(local);
- out_unlock:
mutex_unlock(&local->mtx);
}
@@ -456,47 +501,503 @@ void ieee80211_remain_on_channel_expired(struct ieee80211_hw *hw)
}
EXPORT_SYMBOL_GPL(ieee80211_remain_on_channel_expired);
-void ieee80211_roc_setup(struct ieee80211_local *local)
+static bool
+ieee80211_coalesce_hw_started_roc(struct ieee80211_local *local,
+ struct ieee80211_roc_work *new_roc,
+ struct ieee80211_roc_work *cur_roc)
{
- INIT_WORK(&local->hw_roc_start, ieee80211_hw_roc_start);
- INIT_WORK(&local->hw_roc_done, ieee80211_hw_roc_done);
- INIT_LIST_HEAD(&local->roc_list);
+ unsigned long now = jiffies;
+ unsigned long remaining;
+
+ if (WARN_ON(!cur_roc->started))
+ return false;
+
+ /* if it was scheduled in the hardware, but not started yet,
+ * we can only combine if the older one had a longer duration
+ */
+ if (!cur_roc->hw_begun && new_roc->duration > cur_roc->duration)
+ return false;
+
+ remaining = cur_roc->start_time +
+ msecs_to_jiffies(cur_roc->duration) -
+ now;
+
+ /* if it doesn't fit entirely, schedule a new one */
+ if (new_roc->duration > jiffies_to_msecs(remaining))
+ return false;
+
+ /* add just after the current one so we combine their finish later */
+ list_add(&new_roc->list, &cur_roc->list);
+
+ /* if the existing one has already begun then let this one also
+ * begin, otherwise they'll both be marked properly by the work
+ * struct that runs once the driver notifies us of the beginning
+ */
+ if (cur_roc->hw_begun) {
+ new_roc->hw_begun = true;
+ ieee80211_handle_roc_started(new_roc, now);
+ }
+
+ return true;
}
-void ieee80211_roc_purge(struct ieee80211_local *local,
- struct ieee80211_sub_if_data *sdata)
+static int ieee80211_start_roc_work(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_channel *channel,
+ unsigned int duration, u64 *cookie,
+ struct sk_buff *txskb,
+ enum ieee80211_roc_type type)
{
struct ieee80211_roc_work *roc, *tmp;
- LIST_HEAD(tmp_list);
+ bool queued = false, combine_started = true;
+ int ret;
+
+ lockdep_assert_held(&local->mtx);
+
+ if (local->use_chanctx && !local->ops->remain_on_channel)
+ return -EOPNOTSUPP;
+
+ roc = kzalloc(sizeof(*roc), GFP_KERNEL);
+ if (!roc)
+ return -ENOMEM;
+
+ /*
+ * If the duration is zero, then the driver
+ * wouldn't actually do anything. Set it to
+ * 10 for now.
+ *
+ * TODO: cancel the off-channel operation
+ * when we get the SKB's TX status and
+ * the wait time was zero before.
+ */
+ if (!duration)
+ duration = 10;
+
+ roc->chan = channel;
+ roc->duration = duration;
+ roc->req_duration = duration;
+ roc->frame = txskb;
+ roc->type = type;
+ roc->sdata = sdata;
+
+ /*
+ * cookie is either the roc cookie (for normal roc)
+ * or the SKB (for mgmt TX)
+ */
+ if (!txskb) {
+ roc->cookie = ieee80211_mgmt_tx_cookie(local);
+ *cookie = roc->cookie;
+ } else {
+ roc->mgmt_tx_cookie = *cookie;
+ }
+
+ /* if there's no need to queue, handle it immediately */
+ if (list_empty(&local->roc_list) &&
+ !local->scanning && !ieee80211_is_radar_required(local)) {
+ /* if not HW assist, just queue & schedule work */
+ if (!local->ops->remain_on_channel) {
+ list_add_tail(&roc->list, &local->roc_list);
+ ieee80211_queue_delayed_work(&local->hw,
+ &local->roc_work, 0);
+ } else {
+ /* otherwise actually kick it off here
+ * (for error handling)
+ */
+ ret = drv_remain_on_channel(local, sdata, channel,
+ duration, type);
+ if (ret) {
+ kfree(roc);
+ return ret;
+ }
+ roc->started = true;
+ list_add_tail(&roc->list, &local->roc_list);
+ }
+
+ return 0;
+ }
+
+ /* otherwise handle queueing */
+
+ list_for_each_entry(tmp, &local->roc_list, list) {
+ if (tmp->chan != channel || tmp->sdata != sdata)
+ continue;
+
+ /*
+ * Extend this ROC if possible: If it hasn't started, add
+ * just after the new one to combine.
+ */
+ if (!tmp->started) {
+ list_add(&roc->list, &tmp->list);
+ queued = true;
+ break;
+ }
+
+ if (!combine_started)
+ continue;
+
+ if (!local->ops->remain_on_channel) {
+ /* If there's no hardware remain-on-channel, and
+ * doing so won't push us over the maximum r-o-c
+ * we allow, then we can just add the new one to
+ * the list and mark it as having started now.
+ * If it would push over the limit, don't try to
+ * combine with other started ones (that haven't
+ * been running as long) but potentially sort it
+ * with others that had the same fate.
+ */
+ unsigned long now = jiffies;
+ u32 elapsed = jiffies_to_msecs(now - tmp->start_time);
+ struct wiphy *wiphy = local->hw.wiphy;
+ u32 max_roc = wiphy->max_remain_on_channel_duration;
+
+ if (elapsed + roc->duration > max_roc) {
+ combine_started = false;
+ continue;
+ }
+
+ list_add(&roc->list, &tmp->list);
+ queued = true;
+ roc->on_channel = tmp->on_channel;
+ ieee80211_handle_roc_started(roc, now);
+ ieee80211_recalc_sw_work(local, now);
+ break;
+ }
+
+ queued = ieee80211_coalesce_hw_started_roc(local, roc, tmp);
+ if (queued)
+ break;
+ /* if it wasn't queued, perhaps it can be combined with
+ * another that also couldn't get combined previously,
+ * but no need to check for already started ones, since
+ * that can't work.
+ */
+ combine_started = false;
+ }
+
+ if (!queued)
+ list_add_tail(&roc->list, &local->roc_list);
+
+ return 0;
+}
+
+int ieee80211_remain_on_channel(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct ieee80211_channel *chan,
+ unsigned int duration, u64 *cookie)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_local *local = sdata->local;
+ int ret;
+
+ mutex_lock(&local->mtx);
+ ret = ieee80211_start_roc_work(local, sdata, chan,
+ duration, cookie, NULL,
+ IEEE80211_ROC_TYPE_NORMAL);
+ mutex_unlock(&local->mtx);
+
+ return ret;
+}
+
+static int ieee80211_cancel_roc(struct ieee80211_local *local,
+ u64 cookie, bool mgmt_tx)
+{
+ struct ieee80211_roc_work *roc, *tmp, *found = NULL;
+ int ret;
+
+ if (!cookie)
+ return -ENOENT;
mutex_lock(&local->mtx);
list_for_each_entry_safe(roc, tmp, &local->roc_list, list) {
- if (sdata && roc->sdata != sdata)
+ if (!mgmt_tx && roc->cookie != cookie)
+ continue;
+ else if (mgmt_tx && roc->mgmt_tx_cookie != cookie)
continue;
- if (roc->started && local->ops->remain_on_channel) {
- /* can race, so ignore return value */
- drv_cancel_remain_on_channel(local);
+ found = roc;
+ break;
+ }
+
+ if (!found) {
+ mutex_unlock(&local->mtx);
+ return -ENOENT;
+ }
+
+ if (!found->started) {
+ ieee80211_roc_notify_destroy(found);
+ goto out_unlock;
+ }
+
+ if (local->ops->remain_on_channel) {
+ ret = drv_cancel_remain_on_channel(local);
+ if (WARN_ON_ONCE(ret)) {
+ mutex_unlock(&local->mtx);
+ return ret;
}
- list_move_tail(&roc->list, &tmp_list);
- roc->abort = true;
+ /* TODO:
+ * if multiple items were combined here then we really shouldn't
+ * cancel them all - we should wait for as much time as needed
+ * for the longest remaining one, and only then cancel ...
+ */
+ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) {
+ if (!roc->started)
+ break;
+ if (roc == found)
+ found = NULL;
+ ieee80211_roc_notify_destroy(roc);
+ }
+
+ /* that really must not happen - it was started */
+ WARN_ON(found);
+
+ ieee80211_start_next_roc(local);
+ } else {
+ /* go through work struct to return to the operating channel */
+ found->abort = true;
+ mod_delayed_work(local->workqueue, &local->roc_work, 0);
}
+
+ out_unlock:
mutex_unlock(&local->mtx);
- list_for_each_entry_safe(roc, tmp, &tmp_list, list) {
- if (local->ops->remain_on_channel) {
- list_del(&roc->list);
- ieee80211_roc_notify_destroy(roc, true);
+ return 0;
+}
+
+int ieee80211_cancel_remain_on_channel(struct wiphy *wiphy,
+ struct wireless_dev *wdev, u64 cookie)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_local *local = sdata->local;
+
+ return ieee80211_cancel_roc(local, cookie, false);
+}
+
+int ieee80211_mgmt_tx(struct wiphy *wiphy, struct wireless_dev *wdev,
+ struct cfg80211_mgmt_tx_params *params, u64 *cookie)
+{
+ struct ieee80211_sub_if_data *sdata = IEEE80211_WDEV_TO_SUB_IF(wdev);
+ struct ieee80211_local *local = sdata->local;
+ struct sk_buff *skb;
+ struct sta_info *sta;
+ const struct ieee80211_mgmt *mgmt = (void *)params->buf;
+ bool need_offchan = false;
+ u32 flags;
+ int ret;
+ u8 *data;
+
+ if (params->dont_wait_for_ack)
+ flags = IEEE80211_TX_CTL_NO_ACK;
+ else
+ flags = IEEE80211_TX_INTFL_NL80211_FRAME_TX |
+ IEEE80211_TX_CTL_REQ_TX_STATUS;
+
+ if (params->no_cck)
+ flags |= IEEE80211_TX_CTL_NO_CCK_RATE;
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_ADHOC:
+ if (!sdata->vif.bss_conf.ibss_joined)
+ need_offchan = true;
+ /* fall through */
+#ifdef CONFIG_MAC80211_MESH
+ case NL80211_IFTYPE_MESH_POINT:
+ if (ieee80211_vif_is_mesh(&sdata->vif) &&
+ !sdata->u.mesh.mesh_id_len)
+ need_offchan = true;
+ /* fall through */
+#endif
+ case NL80211_IFTYPE_AP:
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_P2P_GO:
+ if (sdata->vif.type != NL80211_IFTYPE_ADHOC &&
+ !ieee80211_vif_is_mesh(&sdata->vif) &&
+ !rcu_access_pointer(sdata->bss->beacon))
+ need_offchan = true;
+ if (!ieee80211_is_action(mgmt->frame_control) ||
+ mgmt->u.action.category == WLAN_CATEGORY_PUBLIC ||
+ mgmt->u.action.category == WLAN_CATEGORY_SELF_PROTECTED ||
+ mgmt->u.action.category == WLAN_CATEGORY_SPECTRUM_MGMT)
+ break;
+ rcu_read_lock();
+ sta = sta_info_get(sdata, mgmt->da);
+ rcu_read_unlock();
+ if (!sta)
+ return -ENOLINK;
+ break;
+ case NL80211_IFTYPE_STATION:
+ case NL80211_IFTYPE_P2P_CLIENT:
+ sdata_lock(sdata);
+ if (!sdata->u.mgd.associated ||
+ (params->offchan && params->wait &&
+ local->ops->remain_on_channel &&
+ memcmp(sdata->u.mgd.associated->bssid,
+ mgmt->bssid, ETH_ALEN)))
+ need_offchan = true;
+ sdata_unlock(sdata);
+ break;
+ case NL80211_IFTYPE_P2P_DEVICE:
+ need_offchan = true;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ /* configurations requiring offchan cannot work if no channel has been
+ * specified
+ */
+ if (need_offchan && !params->chan)
+ return -EINVAL;
+
+ mutex_lock(&local->mtx);
+
+ /* Check if the operating channel is the requested channel */
+ if (!need_offchan) {
+ struct ieee80211_chanctx_conf *chanctx_conf;
+
+ rcu_read_lock();
+ chanctx_conf = rcu_dereference(sdata->vif.chanctx_conf);
+
+ if (chanctx_conf) {
+ need_offchan = params->chan &&
+ (params->chan !=
+ chanctx_conf->def.chan);
+ } else if (!params->chan) {
+ ret = -EINVAL;
+ rcu_read_unlock();
+ goto out_unlock;
} else {
- ieee80211_queue_delayed_work(&local->hw, &roc->work, 0);
+ need_offchan = true;
+ }
+ rcu_read_unlock();
+ }
+
+ if (need_offchan && !params->offchan) {
+ ret = -EBUSY;
+ goto out_unlock;
+ }
+
+ skb = dev_alloc_skb(local->hw.extra_tx_headroom + params->len);
+ if (!skb) {
+ ret = -ENOMEM;
+ goto out_unlock;
+ }
+ skb_reserve(skb, local->hw.extra_tx_headroom);
+
+ data = skb_put(skb, params->len);
+ memcpy(data, params->buf, params->len);
+
+ /* Update CSA counters */
+ if (sdata->vif.csa_active &&
+ (sdata->vif.type == NL80211_IFTYPE_AP ||
+ sdata->vif.type == NL80211_IFTYPE_MESH_POINT ||
+ sdata->vif.type == NL80211_IFTYPE_ADHOC) &&
+ params->n_csa_offsets) {
+ int i;
+ struct beacon_data *beacon = NULL;
+
+ rcu_read_lock();
+
+ if (sdata->vif.type == NL80211_IFTYPE_AP)
+ beacon = rcu_dereference(sdata->u.ap.beacon);
+ else if (sdata->vif.type == NL80211_IFTYPE_ADHOC)
+ beacon = rcu_dereference(sdata->u.ibss.presp);
+ else if (ieee80211_vif_is_mesh(&sdata->vif))
+ beacon = rcu_dereference(sdata->u.mesh.beacon);
+
+ if (beacon)
+ for (i = 0; i < params->n_csa_offsets; i++)
+ data[params->csa_offsets[i]] =
+ beacon->csa_current_counter;
+
+ rcu_read_unlock();
+ }
- /* work will clean up etc */
- flush_delayed_work(&roc->work);
- WARN_ON(!roc->to_be_freed);
- kfree(roc);
+ IEEE80211_SKB_CB(skb)->flags = flags;
+
+ skb->dev = sdata->dev;
+
+ if (!params->dont_wait_for_ack) {
+ /* make a copy to preserve the frame contents
+ * in case of encryption.
+ */
+ ret = ieee80211_attach_ack_skb(local, skb, cookie, GFP_KERNEL);
+ if (ret) {
+ kfree_skb(skb);
+ goto out_unlock;
}
+ } else {
+ /* Assign a dummy non-zero cookie, it's not sent to
+ * userspace in this case but we rely on its value
+ * internally in the need_offchan case to distinguish
+ * mgmt-tx from remain-on-channel.
+ */
+ *cookie = 0xffffffff;
}
- WARN_ON_ONCE(!list_empty(&tmp_list));
+ if (!need_offchan) {
+ ieee80211_tx_skb(sdata, skb);
+ ret = 0;
+ goto out_unlock;
+ }
+
+ IEEE80211_SKB_CB(skb)->flags |= IEEE80211_TX_CTL_TX_OFFCHAN |
+ IEEE80211_TX_INTFL_OFFCHAN_TX_OK;
+ if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL))
+ IEEE80211_SKB_CB(skb)->hw_queue =
+ local->hw.offchannel_tx_hw_queue;
+
+ /* This will handle all kinds of coalescing and immediate TX */
+ ret = ieee80211_start_roc_work(local, sdata, params->chan,
+ params->wait, cookie, skb,
+ IEEE80211_ROC_TYPE_MGMT_TX);
+ if (ret)
+ ieee80211_free_txskb(&local->hw, skb);
+ out_unlock:
+ mutex_unlock(&local->mtx);
+ return ret;
+}
+
+int ieee80211_mgmt_tx_cancel_wait(struct wiphy *wiphy,
+ struct wireless_dev *wdev, u64 cookie)
+{
+ struct ieee80211_local *local = wiphy_priv(wiphy);
+
+ return ieee80211_cancel_roc(local, cookie, true);
+}
+
+void ieee80211_roc_setup(struct ieee80211_local *local)
+{
+ INIT_WORK(&local->hw_roc_start, ieee80211_hw_roc_start);
+ INIT_WORK(&local->hw_roc_done, ieee80211_hw_roc_done);
+ INIT_DELAYED_WORK(&local->roc_work, ieee80211_roc_work);
+ INIT_LIST_HEAD(&local->roc_list);
+}
+
+void ieee80211_roc_purge(struct ieee80211_local *local,
+ struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_roc_work *roc, *tmp;
+ bool work_to_do = false;
+
+ mutex_lock(&local->mtx);
+ list_for_each_entry_safe(roc, tmp, &local->roc_list, list) {
+ if (sdata && roc->sdata != sdata)
+ continue;
+
+ if (roc->started) {
+ if (local->ops->remain_on_channel) {
+ /* can race, so ignore return value */
+ drv_cancel_remain_on_channel(local);
+ ieee80211_roc_notify_destroy(roc);
+ } else {
+ roc->abort = true;
+ work_to_do = true;
+ }
+ } else {
+ ieee80211_roc_notify_destroy(roc);
+ }
+ }
+ if (work_to_do)
+ __ieee80211_roc_work(local);
+ mutex_unlock(&local->mtx);
}
diff --git a/net/mac80211/rc80211_minstrel.c b/net/mac80211/rc80211_minstrel.c
index 3ece7d1034c8..b54f398cda5d 100644
--- a/net/mac80211/rc80211_minstrel.c
+++ b/net/mac80211/rc80211_minstrel.c
@@ -711,7 +711,7 @@ static u32 minstrel_get_expected_throughput(void *priv_sta)
* computing cur_tp
*/
tmp_mrs = &mi->r[idx].stats;
- tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma);
+ tmp_cur_tp = minstrel_get_tp_avg(&mi->r[idx], tmp_mrs->prob_ewma) * 10;
tmp_cur_tp = tmp_cur_tp * 1200 * 8 / 1024;
return tmp_cur_tp;
diff --git a/net/mac80211/rc80211_minstrel_ht.c b/net/mac80211/rc80211_minstrel_ht.c
index 3928dbd24e25..370d677b547b 100644
--- a/net/mac80211/rc80211_minstrel_ht.c
+++ b/net/mac80211/rc80211_minstrel_ht.c
@@ -414,15 +414,16 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
(max_tp_group != MINSTREL_CCK_GROUP))
return;
+ max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
+ max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
+ max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
+
if (mrs->prob_ewma > MINSTREL_FRAC(75, 100)) {
cur_tp_avg = minstrel_ht_get_tp_avg(mi, cur_group, cur_idx,
mrs->prob_ewma);
if (cur_tp_avg > tmp_tp_avg)
mi->max_prob_rate = index;
- max_gpr_group = mg->max_group_prob_rate / MCS_GROUP_RATES;
- max_gpr_idx = mg->max_group_prob_rate % MCS_GROUP_RATES;
- max_gpr_prob = mi->groups[max_gpr_group].rates[max_gpr_idx].prob_ewma;
max_gpr_tp_avg = minstrel_ht_get_tp_avg(mi, max_gpr_group,
max_gpr_idx,
max_gpr_prob);
@@ -431,7 +432,7 @@ minstrel_ht_set_best_prob_rate(struct minstrel_ht_sta *mi, u16 index)
} else {
if (mrs->prob_ewma > tmp_prob)
mi->max_prob_rate = index;
- if (mrs->prob_ewma > mg->rates[mg->max_group_prob_rate].prob_ewma)
+ if (mrs->prob_ewma > max_gpr_prob)
mg->max_group_prob_rate = index;
}
}
@@ -691,7 +692,7 @@ minstrel_aggr_check(struct ieee80211_sta *pubsta, struct sk_buff *skb)
if (likely(sta->ampdu_mlme.tid_tx[tid]))
return;
- ieee80211_start_tx_ba_session(pubsta, tid, 5000);
+ ieee80211_start_tx_ba_session(pubsta, tid, 0);
}
static void
@@ -871,7 +872,7 @@ minstrel_ht_set_rate(struct minstrel_priv *mp, struct minstrel_ht_sta *mi,
* - if station is in dynamic SMPS (and streams > 1)
* - for fallback rates, to increase chances of getting through
*/
- if (offset > 0 &&
+ if (offset > 0 ||
(mi->sta->smps_mode == IEEE80211_SMPS_DYNAMIC &&
group->streams > 1)) {
ratetbl->rate[offset].count = ratetbl->rate[offset].count_rts;
@@ -1334,7 +1335,8 @@ static u32 minstrel_ht_get_expected_throughput(void *priv_sta)
prob = mi->groups[i].rates[j].prob_ewma;
/* convert tp_avg from pkt per second in kbps */
- tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * AVG_PKT_SIZE * 8 / 1024;
+ tp_avg = minstrel_ht_get_tp_avg(mi, i, j, prob) * 10;
+ tp_avg = tp_avg * AVG_PKT_SIZE * 8 / 1024;
return tp_avg;
}
diff --git a/net/mac80211/rx.c b/net/mac80211/rx.c
index 8bae5de0dc44..dc27becb9b71 100644
--- a/net/mac80211/rx.c
+++ b/net/mac80211/rx.c
@@ -4,6 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -18,6 +19,7 @@
#include <linux/etherdevice.h>
#include <linux/rcupdate.h>
#include <linux/export.h>
+#include <linux/bitops.h>
#include <net/mac80211.h>
#include <net/ieee80211_radiotap.h>
#include <asm/unaligned.h>
@@ -122,7 +124,8 @@ static inline bool should_drop_frame(struct sk_buff *skb, int present_fcs_len,
hdr = (void *)(skb->data + rtap_vendor_space);
if (status->flag & (RX_FLAG_FAILED_FCS_CRC |
- RX_FLAG_FAILED_PLCP_CRC))
+ RX_FLAG_FAILED_PLCP_CRC |
+ RX_FLAG_ONLY_MONITOR))
return true;
if (unlikely(skb->len < 16 + present_fcs_len + rtap_vendor_space))
@@ -507,7 +510,7 @@ ieee80211_rx_monitor(struct ieee80211_local *local, struct sk_buff *origskb,
return NULL;
}
- if (!local->monitors) {
+ if (!local->monitors || (status->flag & RX_FLAG_SKIP_MONITOR)) {
if (should_drop_frame(origskb, present_fcs_len,
rtap_vendor_space)) {
dev_kfree_skb(origskb);
@@ -661,8 +664,7 @@ static void ieee80211_parse_qos(struct ieee80211_rx_data *rx)
static void ieee80211_verify_alignment(struct ieee80211_rx_data *rx)
{
#ifdef CONFIG_MAC80211_VERBOSE_DEBUG
- WARN_ONCE((unsigned long)rx->skb->data & 1,
- "unaligned packet at 0x%p\n", rx->skb->data);
+ WARN_ON_ONCE((unsigned long)rx->skb->data & 1);
#endif
}
@@ -798,6 +800,26 @@ static ieee80211_rx_result ieee80211_rx_mesh_check(struct ieee80211_rx_data *rx)
return RX_CONTINUE;
}
+static inline bool ieee80211_rx_reorder_ready(struct tid_ampdu_rx *tid_agg_rx,
+ int index)
+{
+ struct sk_buff_head *frames = &tid_agg_rx->reorder_buf[index];
+ struct sk_buff *tail = skb_peek_tail(frames);
+ struct ieee80211_rx_status *status;
+
+ if (tid_agg_rx->reorder_buf_filtered & BIT_ULL(index))
+ return true;
+
+ if (!tail)
+ return false;
+
+ status = IEEE80211_SKB_RXCB(tail);
+ if (status->flag & RX_FLAG_AMSDU_MORE)
+ return false;
+
+ return true;
+}
+
static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
struct tid_ampdu_rx *tid_agg_rx,
int index,
@@ -812,7 +834,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
if (skb_queue_empty(skb_list))
goto no_frame;
- if (!ieee80211_rx_reorder_ready(skb_list)) {
+ if (!ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
__skb_queue_purge(skb_list);
goto no_frame;
}
@@ -826,6 +848,7 @@ static void ieee80211_release_reorder_frame(struct ieee80211_sub_if_data *sdata,
}
no_frame:
+ tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
tid_agg_rx->head_seq_num = ieee80211_sn_inc(tid_agg_rx->head_seq_num);
}
@@ -866,7 +889,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
/* release the buffer until next missing frame */
index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
- if (!ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index]) &&
+ if (!ieee80211_rx_reorder_ready(tid_agg_rx, index) &&
tid_agg_rx->stored_mpdu_num) {
/*
* No buffers ready to be released, but check whether any
@@ -875,8 +898,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
int skipped = 1;
for (j = (index + 1) % tid_agg_rx->buf_size; j != index;
j = (j + 1) % tid_agg_rx->buf_size) {
- if (!ieee80211_rx_reorder_ready(
- &tid_agg_rx->reorder_buf[j])) {
+ if (!ieee80211_rx_reorder_ready(tid_agg_rx, j)) {
skipped++;
continue;
}
@@ -903,8 +925,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
skipped) & IEEE80211_SN_MASK;
skipped = 0;
}
- } else while (ieee80211_rx_reorder_ready(
- &tid_agg_rx->reorder_buf[index])) {
+ } else while (ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
ieee80211_release_reorder_frame(sdata, tid_agg_rx, index,
frames);
index = tid_agg_rx->head_seq_num % tid_agg_rx->buf_size;
@@ -915,8 +936,7 @@ static void ieee80211_sta_reorder_release(struct ieee80211_sub_if_data *sdata,
for (; j != (index - 1) % tid_agg_rx->buf_size;
j = (j + 1) % tid_agg_rx->buf_size) {
- if (ieee80211_rx_reorder_ready(
- &tid_agg_rx->reorder_buf[j]))
+ if (ieee80211_rx_reorder_ready(tid_agg_rx, j))
break;
}
@@ -987,7 +1007,7 @@ static bool ieee80211_sta_manage_reorder_buf(struct ieee80211_sub_if_data *sdata
index = mpdu_seq_num % tid_agg_rx->buf_size;
/* check if we already stored this frame */
- if (ieee80211_rx_reorder_ready(&tid_agg_rx->reorder_buf[index])) {
+ if (ieee80211_rx_reorder_ready(tid_agg_rx, index)) {
dev_kfree_skb(skb);
goto out;
}
@@ -1100,6 +1120,9 @@ ieee80211_rx_h_check_dup(struct ieee80211_rx_data *rx)
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)rx->skb->data;
struct ieee80211_rx_status *status = IEEE80211_SKB_RXCB(rx->skb);
+ if (status->flag & RX_FLAG_DUP_VALIDATED)
+ return RX_CONTINUE;
+
/*
* Drop duplicate 802.11 retransmissions
* (IEEE 802.11-2012: 9.3.2.10 "Duplicate detection and recovery")
@@ -1754,7 +1777,7 @@ ieee80211_reassemble_add(struct ieee80211_sub_if_data *sdata,
entry->seq = seq;
entry->rx_queue = rx_queue;
entry->last_frag = frag;
- entry->ccmp = 0;
+ entry->check_sequential_pn = false;
entry->extra_len = 0;
return entry;
@@ -1850,15 +1873,27 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
rx->seqno_idx, &(rx->skb));
if (rx->key &&
(rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP ||
- rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256) &&
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_CCMP_256 ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP ||
+ rx->key->conf.cipher == WLAN_CIPHER_SUITE_GCMP_256) &&
ieee80211_has_protected(fc)) {
int queue = rx->security_idx;
- /* Store CCMP PN so that we can verify that the next
- * fragment has a sequential PN value. */
- entry->ccmp = 1;
+
+ /* Store CCMP/GCMP PN so that we can verify that the
+ * next fragment has a sequential PN value.
+ */
+ entry->check_sequential_pn = true;
memcpy(entry->last_pn,
rx->key->u.ccmp.rx_pn[queue],
IEEE80211_CCMP_PN_LEN);
+ BUILD_BUG_ON(offsetof(struct ieee80211_key,
+ u.ccmp.rx_pn) !=
+ offsetof(struct ieee80211_key,
+ u.gcmp.rx_pn));
+ BUILD_BUG_ON(sizeof(rx->key->u.ccmp.rx_pn[queue]) !=
+ sizeof(rx->key->u.gcmp.rx_pn[queue]));
+ BUILD_BUG_ON(IEEE80211_CCMP_PN_LEN !=
+ IEEE80211_GCMP_PN_LEN);
}
return RX_QUEUED;
}
@@ -1873,15 +1908,21 @@ ieee80211_rx_h_defragment(struct ieee80211_rx_data *rx)
return RX_DROP_MONITOR;
}
- /* Verify that MPDUs within one MSDU have sequential PN values.
- * (IEEE 802.11i, 8.3.3.4.5) */
- if (entry->ccmp) {
+ /* "The receiver shall discard MSDUs and MMPDUs whose constituent
+ * MPDU PN values are not incrementing in steps of 1."
+ * see IEEE P802.11-REVmc/D5.0, 12.5.3.4.4, item d (for CCMP)
+ * and IEEE P802.11-REVmc/D5.0, 12.5.5.4.4, item d (for GCMP)
+ */
+ if (entry->check_sequential_pn) {
int i;
u8 pn[IEEE80211_CCMP_PN_LEN], *rpn;
int queue;
+
if (!rx->key ||
(rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP &&
- rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256))
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_CCMP_256 &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP &&
+ rx->key->conf.cipher != WLAN_CIPHER_SUITE_GCMP_256))
return RX_DROP_UNUSABLE;
memcpy(pn, entry->last_pn, IEEE80211_CCMP_PN_LEN);
for (i = IEEE80211_CCMP_PN_LEN - 1; i >= 0; i--) {
@@ -2200,9 +2241,6 @@ ieee80211_rx_h_amsdu(struct ieee80211_rx_data *rx)
skb->dev = dev;
__skb_queue_head_init(&frame_list);
- if (skb_linearize(skb))
- return RX_DROP_UNUSABLE;
-
ieee80211_amsdu_to_8023s(skb, &frame_list, dev->dev_addr,
rx->sdata->vif.type,
rx->local->hw.extra_tx_headroom, true);
@@ -2232,7 +2270,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
struct ieee80211_local *local = rx->local;
struct ieee80211_sub_if_data *sdata = rx->sdata;
struct ieee80211_if_mesh *ifmsh = &sdata->u.mesh;
- u16 q, hdrlen;
+ u16 ac, q, hdrlen;
hdr = (struct ieee80211_hdr *) skb->data;
hdrlen = ieee80211_hdrlen(hdr->frame_control);
@@ -2291,6 +2329,7 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
spin_lock_bh(&mppath->state_lock);
if (!ether_addr_equal(mppath->mpp, mpp_addr))
memcpy(mppath->mpp, mpp_addr, ETH_ALEN);
+ mppath->exp_time = jiffies;
spin_unlock_bh(&mppath->state_lock);
}
rcu_read_unlock();
@@ -2301,7 +2340,8 @@ ieee80211_rx_h_mesh_fwding(struct ieee80211_rx_data *rx)
ether_addr_equal(sdata->vif.addr, hdr->addr3))
return RX_CONTINUE;
- q = ieee80211_select_queue_80211(sdata, skb, hdr);
+ ac = ieee80211_select_queue_80211(sdata, skb, hdr);
+ q = sdata->vif.hw_queue[ac];
if (ieee80211_queue_stopped(&local->hw, q)) {
IEEE80211_IFSTA_MESH_CTR_INC(ifmsh, dropped_frames_congestion);
return RX_DROP_MONITOR;
@@ -2736,10 +2776,14 @@ ieee80211_rx_h_action(struct ieee80211_rx_data *rx)
opmode = mgmt->u.action.u.vht_opmode_notif.operating_mode;
ieee80211_vht_handle_opmode(rx->sdata, rx->sta,
- opmode, status->band,
- false);
+ opmode, status->band);
goto handled;
}
+ case WLAN_VHT_ACTION_GROUPID_MGMT: {
+ if (len < IEEE80211_MIN_ACTION_SIZE + 25)
+ goto invalid;
+ goto queue;
+ }
default:
break;
}
@@ -3075,7 +3119,7 @@ static void ieee80211_rx_cooked_monitor(struct ieee80211_rx_data *rx,
ieee80211_add_rx_radiotap_header(local, skb, rate, needed_headroom,
false);
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb->ip_summed = CHECKSUM_UNNECESSARY;
skb->pkt_type = PACKET_OTHERHOST;
skb->protocol = htons(ETH_P_802_2);
@@ -3277,6 +3321,85 @@ void ieee80211_release_reorder_timeout(struct sta_info *sta, int tid)
ieee80211_rx_handlers(&rx, &frames);
}
+void ieee80211_mark_rx_ba_filtered_frames(struct ieee80211_sta *pubsta, u8 tid,
+ u16 ssn, u64 filtered,
+ u16 received_mpdus)
+{
+ struct sta_info *sta;
+ struct tid_ampdu_rx *tid_agg_rx;
+ struct sk_buff_head frames;
+ struct ieee80211_rx_data rx = {
+ /* This is OK -- must be QoS data frame */
+ .security_idx = tid,
+ .seqno_idx = tid,
+ };
+ int i, diff;
+
+ if (WARN_ON(!pubsta || tid >= IEEE80211_NUM_TIDS))
+ return;
+
+ __skb_queue_head_init(&frames);
+
+ sta = container_of(pubsta, struct sta_info, sta);
+
+ rx.sta = sta;
+ rx.sdata = sta->sdata;
+ rx.local = sta->local;
+
+ rcu_read_lock();
+ tid_agg_rx = rcu_dereference(sta->ampdu_mlme.tid_rx[tid]);
+ if (!tid_agg_rx)
+ goto out;
+
+ spin_lock_bh(&tid_agg_rx->reorder_lock);
+
+ if (received_mpdus >= IEEE80211_SN_MODULO >> 1) {
+ int release;
+
+ /* release all frames in the reorder buffer */
+ release = (tid_agg_rx->head_seq_num + tid_agg_rx->buf_size) %
+ IEEE80211_SN_MODULO;
+ ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx,
+ release, &frames);
+ /* update ssn to match received ssn */
+ tid_agg_rx->head_seq_num = ssn;
+ } else {
+ ieee80211_release_reorder_frames(sta->sdata, tid_agg_rx, ssn,
+ &frames);
+ }
+
+ /* handle the case that received ssn is behind the mac ssn.
+ * it can be tid_agg_rx->buf_size behind and still be valid */
+ diff = (tid_agg_rx->head_seq_num - ssn) & IEEE80211_SN_MASK;
+ if (diff >= tid_agg_rx->buf_size) {
+ tid_agg_rx->reorder_buf_filtered = 0;
+ goto release;
+ }
+ filtered = filtered >> diff;
+ ssn += diff;
+
+ /* update bitmap */
+ for (i = 0; i < tid_agg_rx->buf_size; i++) {
+ int index = (ssn + i) % tid_agg_rx->buf_size;
+
+ tid_agg_rx->reorder_buf_filtered &= ~BIT_ULL(index);
+ if (filtered & BIT_ULL(i))
+ tid_agg_rx->reorder_buf_filtered |= BIT_ULL(index);
+ }
+
+ /* now process also frames that the filter marking released */
+ ieee80211_sta_reorder_release(sta->sdata, tid_agg_rx, &frames);
+
+release:
+ spin_unlock_bh(&tid_agg_rx->reorder_lock);
+
+ ieee80211_rx_handlers(&rx, &frames);
+
+ out:
+ rcu_read_unlock();
+}
+EXPORT_SYMBOL(ieee80211_mark_rx_ba_filtered_frames);
+
/* main receive path */
static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
@@ -3368,6 +3491,7 @@ static bool ieee80211_accept_frame(struct ieee80211_rx_data *rx)
return false;
/* ignore action frames to TDLS-peers */
if (ieee80211_is_action(hdr->frame_control) &&
+ !is_broadcast_ether_addr(bssid) &&
!ether_addr_equal(bssid, hdr->addr1))
return false;
}
diff --git a/net/mac80211/scan.c b/net/mac80211/scan.c
index 4aeca4b0c3cb..ae980ce8daff 100644
--- a/net/mac80211/scan.c
+++ b/net/mac80211/scan.c
@@ -314,6 +314,7 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
bool was_scanning = local->scanning;
struct cfg80211_scan_request *scan_req;
struct ieee80211_sub_if_data *scan_sdata;
+ struct ieee80211_sub_if_data *sdata;
lockdep_assert_held(&local->mtx);
@@ -373,7 +374,16 @@ static void __ieee80211_scan_completed(struct ieee80211_hw *hw, bool aborted)
ieee80211_mlme_notify_scan_completed(local);
ieee80211_ibss_notify_scan_completed(local);
- ieee80211_mesh_notify_scan_completed(local);
+
+ /* Requeue all the work that might have been ignored while
+ * the scan was in progress; if there was none this will
+ * just be a no-op for the particular interface.
+ */
+ list_for_each_entry_rcu(sdata, &local->interfaces, list) {
+ if (ieee80211_sdata_running(sdata))
+ ieee80211_queue_work(&sdata->local->hw, &sdata->work);
+ }
+
if (was_scanning)
ieee80211_start_next_roc(local);
}
@@ -597,8 +607,8 @@ static int __ieee80211_start_scan(struct ieee80211_sub_if_data *sdata,
/* We need to ensure power level is at max for scanning. */
ieee80211_hw_config(local, 0);
- if ((req->channels[0]->flags &
- IEEE80211_CHAN_NO_IR) ||
+ if ((req->channels[0]->flags & (IEEE80211_CHAN_NO_IR |
+ IEEE80211_CHAN_RADAR)) ||
!req->n_ssids) {
next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
} else {
@@ -645,7 +655,7 @@ ieee80211_scan_get_channel_time(struct ieee80211_channel *chan)
* TODO: channel switching also consumes quite some time,
* add that delay as well to get a better estimation
*/
- if (chan->flags & IEEE80211_CHAN_NO_IR)
+ if (chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR))
return IEEE80211_PASSIVE_CHANNEL_TIME;
return IEEE80211_PROBE_DELAY + IEEE80211_CHANNEL_TIME;
}
@@ -777,7 +787,8 @@ static void ieee80211_scan_state_set_channel(struct ieee80211_local *local,
*
* In any case, it is not necessary for a passive scan.
*/
- if (chan->flags & IEEE80211_CHAN_NO_IR || !scan_req->n_ssids) {
+ if ((chan->flags & (IEEE80211_CHAN_NO_IR | IEEE80211_CHAN_RADAR)) ||
+ !scan_req->n_ssids) {
*next_delay = IEEE80211_PASSIVE_CHANNEL_TIME;
local->next_scan_state = SCAN_DECISION;
return;
@@ -1212,6 +1223,14 @@ void ieee80211_sched_scan_stopped(struct ieee80211_hw *hw)
trace_api_sched_scan_stopped(local);
+ /*
+ * this shouldn't really happen, so for simplicity
+ * simply ignore it, and let mac80211 reconfigure
+ * the sched scan later on.
+ */
+ if (local->in_reconfig)
+ return;
+
schedule_work(&local->sched_scan_stopped_work);
}
EXPORT_SYMBOL(ieee80211_sched_scan_stopped);
diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
index f91d1873218c..861b93ffbe92 100644
--- a/net/mac80211/sta_info.c
+++ b/net/mac80211/sta_info.c
@@ -2,6 +2,7 @@
* Copyright 2002-2005, Instant802 Networks, Inc.
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright (C) 2015 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -66,6 +67,7 @@
static const struct rhashtable_params sta_rht_params = {
.nelem_hint = 3, /* start small */
+ .insecure_elasticity = true, /* Disable chain-length checks. */
.automatic_shrinking = true,
.head_offset = offsetof(struct sta_info, hash_node),
.key_offset = offsetof(struct sta_info, addr),
@@ -115,6 +117,7 @@ static void __cleanup_single_sta(struct sta_info *sta)
ieee80211_purge_tx_queue(&local->hw, &txqi->queue);
atomic_sub(n, &sdata->txqs_len[txqi->txq.ac]);
+ txqi->byte_cnt = 0;
}
}
@@ -256,11 +259,11 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
}
/* Caller must hold local->sta_mtx */
-static void sta_info_hash_add(struct ieee80211_local *local,
- struct sta_info *sta)
+static int sta_info_hash_add(struct ieee80211_local *local,
+ struct sta_info *sta)
{
- rhashtable_insert_fast(&local->sta_hash, &sta->hash_node,
- sta_rht_params);
+ return rhashtable_insert_fast(&local->sta_hash, &sta->hash_node,
+ sta_rht_params);
}
static void sta_deliver_ps_frames(struct work_struct *wk)
@@ -435,6 +438,19 @@ static int sta_info_insert_check(struct sta_info *sta)
is_multicast_ether_addr(sta->sta.addr)))
return -EINVAL;
+ /* Strictly speaking this isn't necessary as we hold the mutex, but
+ * the rhashtable code can't really deal with that distinction. We
+ * do require the mutex for correctness though.
+ */
+ rcu_read_lock();
+ lockdep_assert_held(&sdata->local->sta_mtx);
+ if (ieee80211_hw_check(&sdata->local->hw, NEEDS_UNIQUE_STA_ADDR) &&
+ ieee80211_find_sta_by_ifaddr(&sdata->local->hw, sta->addr, NULL)) {
+ rcu_read_unlock();
+ return -ENOTUNIQ;
+ }
+ rcu_read_unlock();
+
return 0;
}
@@ -484,11 +500,17 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
{
struct ieee80211_local *local = sta->local;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- struct station_info sinfo;
+ struct station_info *sinfo;
int err = 0;
lockdep_assert_held(&local->sta_mtx);
+ sinfo = kzalloc(sizeof(struct station_info), GFP_KERNEL);
+ if (!sinfo) {
+ err = -ENOMEM;
+ goto out_err;
+ }
+
/* check if STA exists already */
if (sta_info_get_bss(sdata, sta->sta.addr)) {
err = -EEXIST;
@@ -503,7 +525,9 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
set_sta_flag(sta, WLAN_STA_BLOCK_BA);
/* make the station visible */
- sta_info_hash_add(local, sta);
+ err = sta_info_hash_add(local, sta);
+ if (err)
+ goto out_drop_sta;
list_add_tail_rcu(&sta->list, &local->sta_list);
@@ -516,14 +540,12 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
/* accept BA sessions now */
clear_sta_flag(sta, WLAN_STA_BLOCK_BA);
- ieee80211_recalc_min_chandef(sdata);
ieee80211_sta_debugfs_add(sta);
rate_control_add_sta_debugfs(sta);
- memset(&sinfo, 0, sizeof(sinfo));
- sinfo.filled = 0;
- sinfo.generation = local->sta_generation;
- cfg80211_new_sta(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
+ sinfo->generation = local->sta_generation;
+ cfg80211_new_sta(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
+ kfree(sinfo);
sta_dbg(sdata, "Inserted STA %pM\n", sta->sta.addr);
@@ -538,11 +560,13 @@ static int sta_info_insert_finish(struct sta_info *sta) __acquires(RCU)
out_remove:
sta_info_hash_del(local, sta);
list_del_rcu(&sta->list);
+ out_drop_sta:
local->num_sta--;
synchronize_net();
__cleanup_single_sta(sta);
out_err:
mutex_unlock(&local->sta_mtx);
+ kfree(sinfo);
rcu_read_lock();
return err;
}
@@ -554,14 +578,15 @@ int sta_info_insert_rcu(struct sta_info *sta) __acquires(RCU)
might_sleep();
+ mutex_lock(&local->sta_mtx);
+
err = sta_info_insert_check(sta);
if (err) {
+ mutex_unlock(&local->sta_mtx);
rcu_read_lock();
goto out_free;
}
- mutex_lock(&local->sta_mtx);
-
err = sta_info_insert_finish(sta);
if (err)
goto out_free;
@@ -868,6 +893,7 @@ static int __must_check __sta_info_destroy_part1(struct sta_info *sta)
}
list_del_rcu(&sta->list);
+ sta->removed = true;
drv_sta_pre_rcu_remove(local, sta->sdata, sta);
@@ -882,7 +908,7 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
{
struct ieee80211_local *local = sta->local;
struct ieee80211_sub_if_data *sdata = sta->sdata;
- struct station_info sinfo = {};
+ struct station_info *sinfo;
int ret;
/*
@@ -920,12 +946,14 @@ static void __sta_info_destroy_part2(struct sta_info *sta)
sta_dbg(sdata, "Removed STA %pM\n", sta->sta.addr);
- sta_set_sinfo(sta, &sinfo);
- cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, &sinfo, GFP_KERNEL);
+ sinfo = kzalloc(sizeof(*sinfo), GFP_KERNEL);
+ if (sinfo)
+ sta_set_sinfo(sta, sinfo);
+ cfg80211_del_sta_sinfo(sdata->dev, sta->sta.addr, sinfo, GFP_KERNEL);
+ kfree(sinfo);
rate_control_remove_sta_debugfs(sta);
ieee80211_sta_debugfs_remove(sta);
- ieee80211_recalc_min_chandef(sdata);
cleanup_single_sta(sta);
}
@@ -1230,11 +1258,11 @@ void ieee80211_sta_ps_deliver_wakeup(struct sta_info *sta)
ieee80211_check_fast_xmit(sta);
}
-static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata,
- struct sta_info *sta, int tid,
+static void ieee80211_send_null_response(struct sta_info *sta, int tid,
enum ieee80211_frame_release_type reason,
- bool call_driver)
+ bool call_driver, bool more_data)
{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
struct ieee80211_qos_hdr *nullfunc;
struct sk_buff *skb;
@@ -1274,9 +1302,13 @@ static void ieee80211_send_null_response(struct ieee80211_sub_if_data *sdata,
if (qos) {
nullfunc->qos_ctrl = cpu_to_le16(tid);
- if (reason == IEEE80211_FRAME_RELEASE_UAPSD)
+ if (reason == IEEE80211_FRAME_RELEASE_UAPSD) {
nullfunc->qos_ctrl |=
cpu_to_le16(IEEE80211_QOS_CTL_EOSP);
+ if (more_data)
+ nullfunc->frame_control |=
+ cpu_to_le16(IEEE80211_FCTL_MOREDATA);
+ }
}
info = IEEE80211_SKB_CB(skb);
@@ -1323,22 +1355,48 @@ static int find_highest_prio_tid(unsigned long tids)
return fls(tids) - 1;
}
+/* Indicates if the MORE_DATA bit should be set in the last
+ * frame obtained by ieee80211_sta_ps_get_frames.
+ * Note that driver_release_tids is relevant only if
+ * reason = IEEE80211_FRAME_RELEASE_PSPOLL
+ */
+static bool
+ieee80211_sta_ps_more_data(struct sta_info *sta, u8 ignored_acs,
+ enum ieee80211_frame_release_type reason,
+ unsigned long driver_release_tids)
+{
+ int ac;
+
+ /* If the driver has data on more than one TID then
+ * certainly there's more data if we release just a
+ * single frame now (from a single TID). This will
+ * only happen for PS-Poll.
+ */
+ if (reason == IEEE80211_FRAME_RELEASE_PSPOLL &&
+ hweight16(driver_release_tids) > 1)
+ return true;
+
+ for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
+ if (ignored_acs & BIT(ac))
+ continue;
+
+ if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
+ !skb_queue_empty(&sta->ps_tx_buf[ac]))
+ return true;
+ }
+
+ return false;
+}
+
static void
-ieee80211_sta_ps_deliver_response(struct sta_info *sta,
- int n_frames, u8 ignored_acs,
- enum ieee80211_frame_release_type reason)
+ieee80211_sta_ps_get_frames(struct sta_info *sta, int n_frames, u8 ignored_acs,
+ enum ieee80211_frame_release_type reason,
+ struct sk_buff_head *frames,
+ unsigned long *driver_release_tids)
{
struct ieee80211_sub_if_data *sdata = sta->sdata;
struct ieee80211_local *local = sdata->local;
- bool more_data = false;
int ac;
- unsigned long driver_release_tids = 0;
- struct sk_buff_head frames;
-
- /* Service or PS-Poll period starts */
- set_sta_flag(sta, WLAN_STA_SP);
-
- __skb_queue_head_init(&frames);
/* Get response frame(s) and more data bit for the last one. */
for (ac = 0; ac < IEEE80211_NUM_ACS; ac++) {
@@ -1352,26 +1410,13 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
/* if we already have frames from software, then we can't also
* release from hardware queues
*/
- if (skb_queue_empty(&frames)) {
- driver_release_tids |= sta->driver_buffered_tids & tids;
- driver_release_tids |= sta->txq_buffered_tids & tids;
+ if (skb_queue_empty(frames)) {
+ *driver_release_tids |=
+ sta->driver_buffered_tids & tids;
+ *driver_release_tids |= sta->txq_buffered_tids & tids;
}
- if (driver_release_tids) {
- /* If the driver has data on more than one TID then
- * certainly there's more data if we release just a
- * single frame now (from a single TID). This will
- * only happen for PS-Poll.
- */
- if (reason == IEEE80211_FRAME_RELEASE_PSPOLL &&
- hweight16(driver_release_tids) > 1) {
- more_data = true;
- driver_release_tids =
- BIT(find_highest_prio_tid(
- driver_release_tids));
- break;
- }
- } else {
+ if (!*driver_release_tids) {
struct sk_buff *skb;
while (n_frames > 0) {
@@ -1385,20 +1430,44 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
if (!skb)
break;
n_frames--;
- __skb_queue_tail(&frames, skb);
+ __skb_queue_tail(frames, skb);
}
}
- /* If we have more frames buffered on this AC, then set the
- * more-data bit and abort the loop since we can't send more
- * data from other ACs before the buffered frames from this.
+ /* If we have more frames buffered on this AC, then abort the
+ * loop since we can't send more data from other ACs before
+ * the buffered frames from this.
*/
if (!skb_queue_empty(&sta->tx_filtered[ac]) ||
- !skb_queue_empty(&sta->ps_tx_buf[ac])) {
- more_data = true;
+ !skb_queue_empty(&sta->ps_tx_buf[ac]))
break;
- }
}
+}
+
+static void
+ieee80211_sta_ps_deliver_response(struct sta_info *sta,
+ int n_frames, u8 ignored_acs,
+ enum ieee80211_frame_release_type reason)
+{
+ struct ieee80211_sub_if_data *sdata = sta->sdata;
+ struct ieee80211_local *local = sdata->local;
+ unsigned long driver_release_tids = 0;
+ struct sk_buff_head frames;
+ bool more_data;
+
+ /* Service or PS-Poll period starts */
+ set_sta_flag(sta, WLAN_STA_SP);
+
+ __skb_queue_head_init(&frames);
+
+ ieee80211_sta_ps_get_frames(sta, n_frames, ignored_acs, reason,
+ &frames, &driver_release_tids);
+
+ more_data = ieee80211_sta_ps_more_data(sta, ignored_acs, reason, driver_release_tids);
+
+ if (driver_release_tids && reason == IEEE80211_FRAME_RELEASE_PSPOLL)
+ driver_release_tids =
+ BIT(find_highest_prio_tid(driver_release_tids));
if (skb_queue_empty(&frames) && !driver_release_tids) {
int tid;
@@ -1421,7 +1490,7 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
/* This will evaluate to 1, 3, 5 or 7. */
tid = 7 - ((ffs(~ignored_acs) - 1) << 1);
- ieee80211_send_null_response(sdata, sta, tid, reason, true);
+ ieee80211_send_null_response(sta, tid, reason, true, false);
} else if (!driver_release_tids) {
struct sk_buff_head pending;
struct sk_buff *skb;
@@ -1521,8 +1590,8 @@ ieee80211_sta_ps_deliver_response(struct sta_info *sta,
if (need_null)
ieee80211_send_null_response(
- sdata, sta, find_highest_prio_tid(tids),
- reason, false);
+ sta, find_highest_prio_tid(tids),
+ reason, false, false);
sta_info_recalc_tim(sta);
} else {
@@ -1660,6 +1729,22 @@ void ieee80211_sta_eosp(struct ieee80211_sta *pubsta)
}
EXPORT_SYMBOL(ieee80211_sta_eosp);
+void ieee80211_send_eosp_nullfunc(struct ieee80211_sta *pubsta, int tid)
+{
+ struct sta_info *sta = container_of(pubsta, struct sta_info, sta);
+ enum ieee80211_frame_release_type reason;
+ bool more_data;
+
+ trace_api_send_eosp_nullfunc(sta->local, pubsta, tid);
+
+ reason = IEEE80211_FRAME_RELEASE_UAPSD;
+ more_data = ieee80211_sta_ps_more_data(sta, ~sta->sta.uapsd_queues,
+ reason, 0);
+
+ ieee80211_send_null_response(sta, tid, reason, false, more_data);
+}
+EXPORT_SYMBOL(ieee80211_send_eosp_nullfunc);
+
void ieee80211_sta_set_buffered(struct ieee80211_sta *pubsta,
u8 tid, bool buffered)
{
@@ -1735,14 +1820,17 @@ int sta_info_move_state(struct sta_info *sta,
clear_bit(WLAN_STA_AUTH, &sta->_flags);
break;
case IEEE80211_STA_AUTH:
- if (sta->sta_state == IEEE80211_STA_NONE)
+ if (sta->sta_state == IEEE80211_STA_NONE) {
set_bit(WLAN_STA_AUTH, &sta->_flags);
- else if (sta->sta_state == IEEE80211_STA_ASSOC)
+ } else if (sta->sta_state == IEEE80211_STA_ASSOC) {
clear_bit(WLAN_STA_ASSOC, &sta->_flags);
+ ieee80211_recalc_min_chandef(sta->sdata);
+ }
break;
case IEEE80211_STA_ASSOC:
if (sta->sta_state == IEEE80211_STA_AUTH) {
set_bit(WLAN_STA_ASSOC, &sta->_flags);
+ ieee80211_recalc_min_chandef(sta->sdata);
} else if (sta->sta_state == IEEE80211_STA_AUTHORIZED) {
if (sta->sdata->vif.type == NL80211_IFTYPE_AP ||
(sta->sdata->vif.type == NL80211_IFTYPE_AP_VLAN &&
diff --git a/net/mac80211/sta_info.h b/net/mac80211/sta_info.h
index 2cafb21b422f..62193f4bc37b 100644
--- a/net/mac80211/sta_info.h
+++ b/net/mac80211/sta_info.h
@@ -1,6 +1,7 @@
/*
* Copyright 2002-2005, Devicescape Software, Inc.
* Copyright 2013-2014 Intel Mobile Communications GmbH
+ * Copyright(c) 2015 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -167,6 +168,8 @@ struct tid_ampdu_tx {
*
* @reorder_buf: buffer to reorder incoming aggregated MPDUs. An MPDU may be an
* A-MSDU with individually reported subframes.
+ * @reorder_buf_filtered: bitmap indicating where there are filtered frames in
+ * the reorder buffer that should be ignored when releasing frames
* @reorder_time: jiffies when skb was added
* @session_timer: check if peer keeps Tx-ing on the TID (by timeout value)
* @reorder_timer: releases expired frames from the reorder buffer.
@@ -194,6 +197,7 @@ struct tid_ampdu_tx {
struct tid_ampdu_rx {
struct rcu_head rcu_head;
spinlock_t reorder_lock;
+ u64 reorder_buf_filtered;
struct sk_buff_head *reorder_buf;
unsigned long *reorder_time;
struct timer_list session_timer;
@@ -212,20 +216,21 @@ struct tid_ampdu_rx {
/**
* struct sta_ampdu_mlme - STA aggregation information.
*
+ * @mtx: mutex to protect all TX data (except non-NULL assignments
+ * to tid_tx[idx], which are protected by the sta spinlock)
+ * tid_start_tx is also protected by sta->lock.
* @tid_rx: aggregation info for Rx per TID -- RCU protected
- * @tid_tx: aggregation info for Tx per TID
- * @tid_start_tx: sessions where start was requested
- * @addba_req_num: number of times addBA request has been sent.
- * @last_addba_req_time: timestamp of the last addBA request.
- * @dialog_token_allocator: dialog token enumerator for each new session;
- * @work: work struct for starting/stopping aggregation
* @tid_rx_timer_expired: bitmap indicating on which TIDs the
* RX timer expired until the work for it runs
* @tid_rx_stop_requested: bitmap indicating which BA sessions per TID the
* driver requested to close until the work for it runs
- * @mtx: mutex to protect all TX data (except non-NULL assignments
- * to tid_tx[idx], which are protected by the sta spinlock)
- * tid_start_tx is also protected by sta->lock.
+ * @agg_session_valid: bitmap indicating which TID has a rx BA session open on
+ * @work: work struct for starting/stopping aggregation
+ * @tid_tx: aggregation info for Tx per TID
+ * @tid_start_tx: sessions where start was requested
+ * @last_addba_req_time: timestamp of the last addBA request.
+ * @addba_req_num: number of times addBA request has been sent.
+ * @dialog_token_allocator: dialog token enumerator for each new session;
*/
struct sta_ampdu_mlme {
struct mutex mtx;
@@ -233,6 +238,7 @@ struct sta_ampdu_mlme {
struct tid_ampdu_rx __rcu *tid_rx[IEEE80211_NUM_TIDS];
unsigned long tid_rx_timer_expired[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
unsigned long tid_rx_stop_requested[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
+ unsigned long agg_session_valid[BITS_TO_LONGS(IEEE80211_NUM_TIDS)];
/* tx */
struct work_struct work;
struct tid_ampdu_tx __rcu *tid_tx[IEEE80211_NUM_TIDS];
@@ -367,10 +373,10 @@ DECLARE_EWMA(signal, 1024, 8)
* @mesh: mesh STA information
* @debugfs: debug filesystem info
* @dead: set to true when sta is unlinked
+ * @removed: set to true when sta is being removed from sta_list
* @uploaded: set to true when sta is uploaded to the driver
* @sta: station information we share with the driver
* @sta_state: duplicates information about station state (for debug)
- * @beacon_loss_count: number of times beacon loss has triggered
* @rcu_head: RCU head used for freeing this station struct
* @cur_max_bandwidth: maximum bandwidth to use for TX to the station,
* taken from HT/VHT capabilities or VHT operating mode notification
@@ -412,6 +418,7 @@ struct sta_info {
u16 listen_interval;
bool dead;
+ bool removed;
bool uploaded;
diff --git a/net/mac80211/status.c b/net/mac80211/status.c
index 5bad05e9af90..8b1b2ea03eb5 100644
--- a/net/mac80211/status.c
+++ b/net/mac80211/status.c
@@ -51,6 +51,11 @@ static void ieee80211_handle_filtered_frame(struct ieee80211_local *local,
struct ieee80211_hdr *hdr = (void *)skb->data;
int ac;
+ if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) {
+ ieee80211_free_txskb(&local->hw, skb);
+ return;
+ }
+
/*
* This skb 'survived' a round-trip through the driver, and
* hopefully the driver didn't mangle it too badly. However,
@@ -692,7 +697,7 @@ void ieee80211_tx_monitor(struct ieee80211_local *local, struct sk_buff *skb,
rtap_len, shift);
/* XXX: is this sufficient for BPF? */
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb->ip_summed = CHECKSUM_UNNECESSARY;
skb->pkt_type = PACKET_OTHERHOST;
skb->protocol = htons(ETH_P_802_2);
diff --git a/net/mac80211/tdls.c b/net/mac80211/tdls.c
index c9eeb3f12808..a29ea813b7d5 100644
--- a/net/mac80211/tdls.c
+++ b/net/mac80211/tdls.c
@@ -4,7 +4,7 @@
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2014, Intel Corporation
* Copyright 2014 Intel Mobile Communications GmbH
- * Copyright 2015 Intel Deutschland GmbH
+ * Copyright 2015 - 2016 Intel Deutschland GmbH
*
* This file is GPLv2 as found in COPYING.
*/
@@ -15,6 +15,7 @@
#include <linux/rtnetlink.h>
#include "ieee80211_i.h"
#include "driver-ops.h"
+#include "rate.h"
/* give usermode some time for retries in setting up the TDLS session */
#define TDLS_PEER_SETUP_TIMEOUT (15 * HZ)
@@ -302,7 +303,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
/* IEEE802.11ac-2013 Table E-4 */
u16 centers_80mhz[] = { 5210, 5290, 5530, 5610, 5690, 5775 };
struct cfg80211_chan_def uc = sta->tdls_chandef;
- enum nl80211_chan_width max_width = ieee80211_get_sta_bw(&sta->sta);
+ enum nl80211_chan_width max_width = ieee80211_sta_cap_chan_bw(sta);
int i;
/* only support upgrading non-narrow channels up to 80Mhz */
@@ -313,7 +314,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
if (max_width > NL80211_CHAN_WIDTH_80)
max_width = NL80211_CHAN_WIDTH_80;
- if (uc.width == max_width)
+ if (uc.width >= max_width)
return;
/*
* Channel usage constrains in the IEEE802.11ac-2013 specification only
@@ -324,6 +325,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
for (i = 0; i < ARRAY_SIZE(centers_80mhz); i++)
if (abs(uc.chan->center_freq - centers_80mhz[i]) <= 30) {
uc.center_freq1 = centers_80mhz[i];
+ uc.center_freq2 = 0;
uc.width = NL80211_CHAN_WIDTH_80;
break;
}
@@ -332,7 +334,7 @@ ieee80211_tdls_chandef_vht_upgrade(struct ieee80211_sub_if_data *sdata,
return;
/* proceed to downgrade the chandef until usable or the same */
- while (uc.width > max_width &&
+ while (uc.width > max_width ||
!cfg80211_reg_can_beacon_relax(sdata->local->hw.wiphy, &uc,
sdata->wdev.iftype))
ieee80211_chandef_downgrade(&uc);
@@ -1242,18 +1244,44 @@ int ieee80211_tdls_mgmt(struct wiphy *wiphy, struct net_device *dev,
return ret;
}
-static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata)
+static void iee80211_tdls_recalc_chanctx(struct ieee80211_sub_if_data *sdata,
+ struct sta_info *sta)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_chanctx_conf *conf;
struct ieee80211_chanctx *ctx;
+ enum nl80211_chan_width width;
+ struct ieee80211_supported_band *sband;
mutex_lock(&local->chanctx_mtx);
conf = rcu_dereference_protected(sdata->vif.chanctx_conf,
lockdep_is_held(&local->chanctx_mtx));
if (conf) {
+ width = conf->def.width;
+ sband = local->hw.wiphy->bands[conf->def.chan->band];
ctx = container_of(conf, struct ieee80211_chanctx, conf);
ieee80211_recalc_chanctx_chantype(local, ctx);
+
+ /* if width changed and a peer is given, update its BW */
+ if (width != conf->def.width && sta &&
+ test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW)) {
+ enum ieee80211_sta_rx_bandwidth bw;
+
+ bw = ieee80211_chan_width_to_rx_bw(conf->def.width);
+ bw = min(bw, ieee80211_sta_cap_rx_bw(sta));
+ if (bw != sta->sta.bandwidth) {
+ sta->sta.bandwidth = bw;
+ rate_control_rate_update(local, sband, sta,
+ IEEE80211_RC_BW_CHANGED);
+ /*
+ * if a TDLS peer BW was updated, we need to
+ * recalc the chandef width again, to get the
+ * correct chanctx min_def
+ */
+ ieee80211_recalc_chanctx_chantype(local, ctx);
+ }
+ }
+
}
mutex_unlock(&local->chanctx_mtx);
}
@@ -1350,8 +1378,6 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
break;
}
- iee80211_tdls_recalc_chanctx(sdata);
-
mutex_lock(&local->sta_mtx);
sta = sta_info_get(sdata, peer);
if (!sta) {
@@ -1360,6 +1386,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
break;
}
+ iee80211_tdls_recalc_chanctx(sdata, sta);
iee80211_tdls_recalc_ht_protection(sdata, sta);
set_sta_flag(sta, WLAN_STA_TDLS_PEER_AUTH);
@@ -1390,7 +1417,7 @@ int ieee80211_tdls_oper(struct wiphy *wiphy, struct net_device *dev,
iee80211_tdls_recalc_ht_protection(sdata, NULL);
mutex_unlock(&local->sta_mtx);
- iee80211_tdls_recalc_chanctx(sdata);
+ iee80211_tdls_recalc_chanctx(sdata, NULL);
break;
default:
ret = -ENOTSUPP;
diff --git a/net/mac80211/tkip.c b/net/mac80211/tkip.c
index 0ae207771a58..b3622823bad2 100644
--- a/net/mac80211/tkip.c
+++ b/net/mac80211/tkip.c
@@ -1,6 +1,7 @@
/*
* Copyright 2002-2004, Instant802 Networks, Inc.
* Copyright 2005, Devicescape Software, Inc.
+ * Copyright (C) 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -142,15 +143,14 @@ static void tkip_mixing_phase2(const u8 *tk, struct tkip_ctx *ctx,
/* Add TKIP IV and Ext. IV at @pos. @iv0, @iv1, and @iv2 are the first octets
* of the IV. Returns pointer to the octet following IVs (i.e., beginning of
* the packet payload). */
-u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key)
+u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key_conf *keyconf, u64 pn)
{
- lockdep_assert_held(&key->u.tkip.txlock);
-
- pos = write_tkip_iv(pos, key->u.tkip.tx.iv16);
- *pos++ = (key->conf.keyidx << 6) | (1 << 5) /* Ext IV */;
- put_unaligned_le32(key->u.tkip.tx.iv32, pos);
+ pos = write_tkip_iv(pos, TKIP_PN_TO_IV16(pn));
+ *pos++ = (keyconf->keyidx << 6) | (1 << 5) /* Ext IV */;
+ put_unaligned_le32(TKIP_PN_TO_IV32(pn), pos);
return pos + 4;
}
+EXPORT_SYMBOL_GPL(ieee80211_tkip_add_iv);
static void ieee80211_compute_tkip_p1k(struct ieee80211_key *key, u32 iv32)
{
@@ -250,6 +250,7 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
u8 rc4key[16], keyid, *pos = payload;
int res;
const u8 *tk = &key->conf.key[NL80211_TKIP_DATA_OFFSET_ENCR_KEY];
+ struct tkip_ctx_rx *rx_ctx = &key->u.tkip.rx[queue];
if (payload_len < 12)
return -1;
@@ -265,37 +266,36 @@ int ieee80211_tkip_decrypt_data(struct crypto_cipher *tfm,
if ((keyid >> 6) != key->conf.keyidx)
return TKIP_DECRYPT_INVALID_KEYIDX;
- if (key->u.tkip.rx[queue].state != TKIP_STATE_NOT_INIT &&
- (iv32 < key->u.tkip.rx[queue].iv32 ||
- (iv32 == key->u.tkip.rx[queue].iv32 &&
- iv16 <= key->u.tkip.rx[queue].iv16)))
+ if (rx_ctx->ctx.state != TKIP_STATE_NOT_INIT &&
+ (iv32 < rx_ctx->iv32 ||
+ (iv32 == rx_ctx->iv32 && iv16 <= rx_ctx->iv16)))
return TKIP_DECRYPT_REPLAY;
if (only_iv) {
res = TKIP_DECRYPT_OK;
- key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+ rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED;
goto done;
}
- if (key->u.tkip.rx[queue].state == TKIP_STATE_NOT_INIT ||
- key->u.tkip.rx[queue].iv32 != iv32) {
+ if (rx_ctx->ctx.state == TKIP_STATE_NOT_INIT ||
+ rx_ctx->iv32 != iv32) {
/* IV16 wrapped around - perform TKIP phase 1 */
- tkip_mixing_phase1(tk, &key->u.tkip.rx[queue], ta, iv32);
+ tkip_mixing_phase1(tk, &rx_ctx->ctx, ta, iv32);
}
if (key->local->ops->update_tkip_key &&
key->flags & KEY_FLAG_UPLOADED_TO_HARDWARE &&
- key->u.tkip.rx[queue].state != TKIP_STATE_PHASE1_HW_UPLOADED) {
+ rx_ctx->ctx.state != TKIP_STATE_PHASE1_HW_UPLOADED) {
struct ieee80211_sub_if_data *sdata = key->sdata;
if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN)
sdata = container_of(key->sdata->bss,
struct ieee80211_sub_if_data, u.ap);
drv_update_tkip_key(key->local, sdata, &key->conf, key->sta,
- iv32, key->u.tkip.rx[queue].p1k);
- key->u.tkip.rx[queue].state = TKIP_STATE_PHASE1_HW_UPLOADED;
+ iv32, rx_ctx->ctx.p1k);
+ rx_ctx->ctx.state = TKIP_STATE_PHASE1_HW_UPLOADED;
}
- tkip_mixing_phase2(tk, &key->u.tkip.rx[queue], iv16, rc4key);
+ tkip_mixing_phase2(tk, &rx_ctx->ctx, iv16, rc4key);
res = ieee80211_wep_decrypt_data(tfm, rc4key, 16, pos, payload_len - 12);
done:
diff --git a/net/mac80211/tkip.h b/net/mac80211/tkip.h
index e3ecb659b90a..a1bcbfbefe7c 100644
--- a/net/mac80211/tkip.h
+++ b/net/mac80211/tkip.h
@@ -13,8 +13,6 @@
#include <linux/crypto.h>
#include "key.h"
-u8 *ieee80211_tkip_add_iv(u8 *pos, struct ieee80211_key *key);
-
int ieee80211_tkip_encrypt_data(struct crypto_cipher *tfm,
struct ieee80211_key *key,
struct sk_buff *skb,
diff --git a/net/mac80211/trace.h b/net/mac80211/trace.h
index 56c6d6cfa5a1..2b0a17ee907a 100644
--- a/net/mac80211/trace.h
+++ b/net/mac80211/trace.h
@@ -80,7 +80,23 @@
#define KEY_PR_FMT " cipher:0x%x, flags=%#x, keyidx=%d, hw_key_idx=%d"
#define KEY_PR_ARG __entry->cipher, __entry->flags, __entry->keyidx, __entry->hw_key_idx
-
+#define AMPDU_ACTION_ENTRY __field(enum ieee80211_ampdu_mlme_action, \
+ ieee80211_ampdu_mlme_action) \
+ STA_ENTRY \
+ __field(u16, tid) \
+ __field(u16, ssn) \
+ __field(u8, buf_size) \
+ __field(bool, amsdu) \
+ __field(u16, timeout)
+#define AMPDU_ACTION_ASSIGN STA_NAMED_ASSIGN(params->sta); \
+ __entry->tid = params->tid; \
+ __entry->ssn = params->ssn; \
+ __entry->buf_size = params->buf_size; \
+ __entry->amsdu = params->amsdu; \
+ __entry->timeout = params->timeout;
+#define AMPDU_ACTION_PR_FMT STA_PR_FMT " tid %d, ssn %d, buf_size %u, amsdu %d, timeout %d"
+#define AMPDU_ACTION_PR_ARG STA_PR_ARG, __entry->tid, __entry->ssn, \
+ __entry->buf_size, __entry->amsdu, __entry->timeout
/*
* Tracing for driver callbacks.
@@ -970,38 +986,25 @@ DEFINE_EVENT(local_only_evt, drv_tx_last_beacon,
TRACE_EVENT(drv_ampdu_action,
TP_PROTO(struct ieee80211_local *local,
struct ieee80211_sub_if_data *sdata,
- enum ieee80211_ampdu_mlme_action action,
- struct ieee80211_sta *sta, u16 tid,
- u16 *ssn, u8 buf_size, bool amsdu),
+ struct ieee80211_ampdu_params *params),
- TP_ARGS(local, sdata, action, sta, tid, ssn, buf_size, amsdu),
+ TP_ARGS(local, sdata, params),
TP_STRUCT__entry(
LOCAL_ENTRY
- STA_ENTRY
- __field(u32, action)
- __field(u16, tid)
- __field(u16, ssn)
- __field(u8, buf_size)
- __field(bool, amsdu)
VIF_ENTRY
+ AMPDU_ACTION_ENTRY
),
TP_fast_assign(
LOCAL_ASSIGN;
VIF_ASSIGN;
- STA_ASSIGN;
- __entry->action = action;
- __entry->tid = tid;
- __entry->ssn = ssn ? *ssn : 0;
- __entry->buf_size = buf_size;
- __entry->amsdu = amsdu;
+ AMPDU_ACTION_ASSIGN;
),
TP_printk(
- LOCAL_PR_FMT VIF_PR_FMT STA_PR_FMT " action:%d tid:%d buf:%d amsdu:%d",
- LOCAL_PR_ARG, VIF_PR_ARG, STA_PR_ARG, __entry->action,
- __entry->tid, __entry->buf_size, __entry->amsdu
+ LOCAL_PR_FMT VIF_PR_FMT AMPDU_ACTION_PR_FMT,
+ LOCAL_PR_ARG, VIF_PR_ARG, AMPDU_ACTION_PR_ARG
)
);
@@ -2027,6 +2030,31 @@ TRACE_EVENT(api_eosp,
)
);
+TRACE_EVENT(api_send_eosp_nullfunc,
+ TP_PROTO(struct ieee80211_local *local,
+ struct ieee80211_sta *sta,
+ u8 tid),
+
+ TP_ARGS(local, sta, tid),
+
+ TP_STRUCT__entry(
+ LOCAL_ENTRY
+ STA_ENTRY
+ __field(u8, tid)
+ ),
+
+ TP_fast_assign(
+ LOCAL_ASSIGN;
+ STA_ASSIGN;
+ __entry->tid = tid;
+ ),
+
+ TP_printk(
+ LOCAL_PR_FMT STA_PR_FMT " tid:%d",
+ LOCAL_PR_ARG, STA_PR_ARG, __entry->tid
+ )
+);
+
TRACE_EVENT(api_sta_set_buffered,
TP_PROTO(struct ieee80211_local *local,
struct ieee80211_sta *sta,
diff --git a/net/mac80211/tx.c b/net/mac80211/tx.c
index bdc224d5053a..21f6602395f7 100644
--- a/net/mac80211/tx.c
+++ b/net/mac80211/tx.c
@@ -710,6 +710,10 @@ ieee80211_tx_h_rate_ctrl(struct ieee80211_tx_data *tx)
info->control.short_preamble = txrc.short_preamble;
+ /* don't ask rate control when rate already injected via radiotap */
+ if (info->control.flags & IEEE80211_TX_CTRL_RATE_INJECT)
+ return TX_CONTINUE;
+
if (tx->sta)
assoc = test_sta_flag(tx->sta, WLAN_STA_ASSOC);
@@ -1112,11 +1116,15 @@ static bool ieee80211_tx_prep_agg(struct ieee80211_tx_data *tx,
reset_agg_timer = true;
} else {
queued = true;
+ if (info->flags & IEEE80211_TX_CTL_NO_PS_BUFFER) {
+ clear_sta_flag(tx->sta, WLAN_STA_SP);
+ ps_dbg(tx->sta->sdata,
+ "STA %pM aid %d: SP frame queued, close the SP w/o telling the peer\n",
+ tx->sta->sta.addr, tx->sta->sta.aid);
+ }
info->control.vif = &tx->sdata->vif;
info->flags |= IEEE80211_TX_INTFL_NEED_TXPROCESSING;
- info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS |
- IEEE80211_TX_CTL_NO_PS_BUFFER |
- IEEE80211_TX_STATUS_EOSP;
+ info->flags &= ~IEEE80211_TX_TEMPORARY_FLAGS;
__skb_queue_tail(&tid_tx->pending, skb);
if (skb_queue_len(&tid_tx->pending) > STA_MAX_TX_BUFFER)
purge_skb = __skb_dequeue(&tid_tx->pending);
@@ -1243,7 +1251,8 @@ static void ieee80211_drv_tx(struct ieee80211_local *local,
struct txq_info *txqi;
u8 ac;
- if (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE)
+ if ((info->flags & IEEE80211_TX_CTL_SEND_AFTER_DTIM) ||
+ (info->control.flags & IEEE80211_TX_CTRL_PS_RESPONSE))
goto tx_normal;
if (!ieee80211_is_data(hdr->frame_control))
@@ -1266,7 +1275,11 @@ static void ieee80211_drv_tx(struct ieee80211_local *local,
if (atomic_read(&sdata->txqs_len[ac]) >= local->hw.txq_ac_max_pending)
netif_stop_subqueue(sdata->dev, ac);
- skb_queue_tail(&txqi->queue, skb);
+ spin_lock_bh(&txqi->queue.lock);
+ txqi->byte_cnt += skb->len;
+ __skb_queue_tail(&txqi->queue, skb);
+ spin_unlock_bh(&txqi->queue.lock);
+
drv_wake_tx_queue(local, txqi);
return;
@@ -1294,6 +1307,8 @@ struct sk_buff *ieee80211_tx_dequeue(struct ieee80211_hw *hw,
if (!skb)
goto out;
+ txqi->byte_cnt -= skb->len;
+
atomic_dec(&sdata->txqs_len[ac]);
if (__netif_subqueue_stopped(sdata->dev, ac))
ieee80211_propagate_queue_wake(local, sdata->vif.hw_queue[ac]);
@@ -1431,7 +1446,7 @@ static bool __ieee80211_tx(struct ieee80211_local *local,
info->hw_queue =
vif->hw_queue[skb_get_queue_mapping(skb)];
} else if (ieee80211_hw_check(&local->hw, QUEUE_CONTROL)) {
- dev_kfree_skb(skb);
+ ieee80211_purge_tx_queue(&local->hw, skbs);
return true;
} else
vif = NULL;
@@ -1665,15 +1680,24 @@ void ieee80211_xmit(struct ieee80211_sub_if_data *sdata,
ieee80211_tx(sdata, sta, skb, false);
}
-static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
+static bool ieee80211_parse_tx_radiotap(struct ieee80211_local *local,
+ struct sk_buff *skb)
{
struct ieee80211_radiotap_iterator iterator;
struct ieee80211_radiotap_header *rthdr =
(struct ieee80211_radiotap_header *) skb->data;
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
+ struct ieee80211_supported_band *sband =
+ local->hw.wiphy->bands[info->band];
int ret = ieee80211_radiotap_iterator_init(&iterator, rthdr, skb->len,
NULL);
u16 txflags;
+ u16 rate = 0;
+ bool rate_found = false;
+ u8 rate_retries = 0;
+ u16 rate_flags = 0;
+ u8 mcs_known, mcs_flags;
+ int i;
info->flags |= IEEE80211_TX_INTFL_DONT_ENCRYPT |
IEEE80211_TX_CTL_DONTFRAG;
@@ -1724,6 +1748,35 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
info->flags |= IEEE80211_TX_CTL_NO_ACK;
break;
+ case IEEE80211_RADIOTAP_RATE:
+ rate = *iterator.this_arg;
+ rate_flags = 0;
+ rate_found = true;
+ break;
+
+ case IEEE80211_RADIOTAP_DATA_RETRIES:
+ rate_retries = *iterator.this_arg;
+ break;
+
+ case IEEE80211_RADIOTAP_MCS:
+ mcs_known = iterator.this_arg[0];
+ mcs_flags = iterator.this_arg[1];
+ if (!(mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_MCS))
+ break;
+
+ rate_found = true;
+ rate = iterator.this_arg[2];
+ rate_flags = IEEE80211_TX_RC_MCS;
+
+ if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_GI &&
+ mcs_flags & IEEE80211_RADIOTAP_MCS_SGI)
+ rate_flags |= IEEE80211_TX_RC_SHORT_GI;
+
+ if (mcs_known & IEEE80211_RADIOTAP_MCS_HAVE_BW &&
+ mcs_flags & IEEE80211_RADIOTAP_MCS_BW_40)
+ rate_flags |= IEEE80211_TX_RC_40_MHZ_WIDTH;
+ break;
+
/*
* Please update the file
* Documentation/networking/mac80211-injection.txt
@@ -1738,6 +1791,32 @@ static bool ieee80211_parse_tx_radiotap(struct sk_buff *skb)
if (ret != -ENOENT) /* ie, if we didn't simply run out of fields */
return false;
+ if (rate_found) {
+ info->control.flags |= IEEE80211_TX_CTRL_RATE_INJECT;
+
+ for (i = 0; i < IEEE80211_TX_MAX_RATES; i++) {
+ info->control.rates[i].idx = -1;
+ info->control.rates[i].flags = 0;
+ info->control.rates[i].count = 0;
+ }
+
+ if (rate_flags & IEEE80211_TX_RC_MCS) {
+ info->control.rates[0].idx = rate;
+ } else {
+ for (i = 0; i < sband->n_bitrates; i++) {
+ if (rate * 5 != sband->bitrates[i].bitrate)
+ continue;
+
+ info->control.rates[0].idx = i;
+ break;
+ }
+ }
+
+ info->control.rates[0].flags = rate_flags;
+ info->control.rates[0].count = min_t(u8, rate_retries + 1,
+ local->hw.max_rate_tries);
+ }
+
/*
* remove the radiotap header
* iterator->_max_length was sanity-checked against
@@ -1818,10 +1897,6 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
info->flags = IEEE80211_TX_CTL_REQ_TX_STATUS |
IEEE80211_TX_CTL_INJECTED;
- /* process and remove the injection radiotap header */
- if (!ieee80211_parse_tx_radiotap(skb))
- goto fail;
-
rcu_read_lock();
/*
@@ -1883,6 +1958,11 @@ netdev_tx_t ieee80211_monitor_start_xmit(struct sk_buff *skb,
goto fail_rcu;
info->band = chandef->chan->band;
+
+ /* process and remove the injection radiotap header */
+ if (!ieee80211_parse_tx_radiotap(local, skb))
+ goto fail_rcu;
+
ieee80211_xmit(sdata, NULL, skb);
rcu_read_unlock();
@@ -2099,8 +2179,11 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
mpp_lookup = true;
}
- if (mpp_lookup)
+ if (mpp_lookup) {
mppath = mpp_path_lookup(sdata, skb->data);
+ if (mppath)
+ mppath->exp_time = jiffies;
+ }
if (mppath && mpath)
mesh_path_del(mpath->sdata, mpath->dst);
@@ -2380,7 +2463,7 @@ static struct sk_buff *ieee80211_build_hdr(struct ieee80211_sub_if_data *sdata,
/* Update skb pointers to various headers since this modified frame
* is going to go through Linux networking code that may potentially
* need things like pointer to IP header. */
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb_set_network_header(skb, nh_pos);
skb_set_transport_header(skb, h_pos);
@@ -3895,9 +3978,9 @@ void __ieee80211_tx_skb_tid_band(struct ieee80211_sub_if_data *sdata,
{
int ac = ieee802_1d_to_ac[tid & 7];
- skb_set_mac_header(skb, 0);
- skb_set_network_header(skb, 0);
- skb_set_transport_header(skb, 0);
+ skb_reset_mac_header(skb);
+ skb_reset_network_header(skb);
+ skb_reset_transport_header(skb);
skb_set_queue_mapping(skb, ac);
skb->priority = tid;
diff --git a/net/mac80211/util.c b/net/mac80211/util.c
index 74058020b7d6..7390de4946a9 100644
--- a/net/mac80211/util.c
+++ b/net/mac80211/util.c
@@ -4,7 +4,7 @@
* Copyright 2006-2007 Jiri Benc <jbenc@suse.cz>
* Copyright 2007 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright (C) 2015 Intel Deutschland GmbH
+ * Copyright (C) 2015-2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -288,10 +288,13 @@ static void __ieee80211_wake_queue(struct ieee80211_hw *hw, int queue,
if (!test_bit(reason, &local->queue_stop_reasons[queue]))
return;
- if (!refcounted)
+ if (!refcounted) {
local->q_stop_reasons[queue][reason] = 0;
- else
+ } else {
local->q_stop_reasons[queue][reason]--;
+ if (WARN_ON(local->q_stop_reasons[queue][reason] < 0))
+ local->q_stop_reasons[queue][reason] = 0;
+ }
if (local->q_stop_reasons[queue][reason] == 0)
__clear_bit(reason, &local->queue_stop_reasons[queue]);
@@ -1641,6 +1644,29 @@ void ieee80211_stop_device(struct ieee80211_local *local)
drv_stop(local);
}
+static void ieee80211_flush_completed_scan(struct ieee80211_local *local,
+ bool aborted)
+{
+ /* It's possible that we don't handle the scan completion in
+ * time during suspend, so if it's still marked as completed
+ * here, queue the work and flush it to clean things up.
+ * Instead of calling the worker function directly here, we
+ * really queue it to avoid potential races with other flows
+ * scheduling the same work.
+ */
+ if (test_bit(SCAN_COMPLETED, &local->scanning)) {
+ /* If coming from reconfiguration failure, abort the scan so
+ * we don't attempt to continue a partial HW scan - which is
+ * possible otherwise if (e.g.) the 2.4 GHz portion was the
+ * completed scan, and a 5 GHz portion is still pending.
+ */
+ if (aborted)
+ set_bit(SCAN_ABORTED, &local->scanning);
+ ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
+ flush_delayed_work(&local->scan_work);
+ }
+}
+
static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
{
struct ieee80211_sub_if_data *sdata;
@@ -1660,6 +1686,8 @@ static void ieee80211_handle_reconfig_failure(struct ieee80211_local *local)
local->suspended = false;
local->in_reconfig = false;
+ ieee80211_flush_completed_scan(local, true);
+
/* scheduled scan clearly can't be running any more, but tell
* cfg80211 and clear local state
*/
@@ -1698,6 +1726,27 @@ static void ieee80211_assign_chanctx(struct ieee80211_local *local,
mutex_unlock(&local->chanctx_mtx);
}
+static void ieee80211_reconfig_stations(struct ieee80211_sub_if_data *sdata)
+{
+ struct ieee80211_local *local = sdata->local;
+ struct sta_info *sta;
+
+ /* add STAs back */
+ mutex_lock(&local->sta_mtx);
+ list_for_each_entry(sta, &local->sta_list, list) {
+ enum ieee80211_sta_state state;
+
+ if (!sta->uploaded || sta->sdata != sdata)
+ continue;
+
+ for (state = IEEE80211_STA_NOTEXIST;
+ state < sta->sta_state; state++)
+ WARN_ON(drv_sta_state(local, sta->sdata, sta, state,
+ state + 1));
+ }
+ mutex_unlock(&local->sta_mtx);
+}
+
int ieee80211_reconfig(struct ieee80211_local *local)
{
struct ieee80211_hw *hw = &local->hw;
@@ -1833,50 +1882,11 @@ int ieee80211_reconfig(struct ieee80211_local *local)
WARN_ON(drv_add_chanctx(local, ctx));
mutex_unlock(&local->chanctx_mtx);
- list_for_each_entry(sdata, &local->interfaces, list) {
- if (!ieee80211_sdata_running(sdata))
- continue;
- ieee80211_assign_chanctx(local, sdata);
- }
-
sdata = rtnl_dereference(local->monitor_sdata);
if (sdata && ieee80211_sdata_running(sdata))
ieee80211_assign_chanctx(local, sdata);
}
- /* add STAs back */
- mutex_lock(&local->sta_mtx);
- list_for_each_entry(sta, &local->sta_list, list) {
- enum ieee80211_sta_state state;
-
- if (!sta->uploaded)
- continue;
-
- /* AP-mode stations will be added later */
- if (sta->sdata->vif.type == NL80211_IFTYPE_AP)
- continue;
-
- for (state = IEEE80211_STA_NOTEXIST;
- state < sta->sta_state; state++)
- WARN_ON(drv_sta_state(local, sta->sdata, sta, state,
- state + 1));
- }
- mutex_unlock(&local->sta_mtx);
-
- /* reconfigure tx conf */
- if (hw->queues >= IEEE80211_NUM_ACS) {
- list_for_each_entry(sdata, &local->interfaces, list) {
- if (sdata->vif.type == NL80211_IFTYPE_AP_VLAN ||
- sdata->vif.type == NL80211_IFTYPE_MONITOR ||
- !ieee80211_sdata_running(sdata))
- continue;
-
- for (i = 0; i < IEEE80211_NUM_ACS; i++)
- drv_conf_tx(local, sdata, i,
- &sdata->tx_conf[i]);
- }
- }
-
/* reconfigure hardware */
ieee80211_hw_config(local, ~0);
@@ -1889,6 +1899,22 @@ int ieee80211_reconfig(struct ieee80211_local *local)
if (!ieee80211_sdata_running(sdata))
continue;
+ ieee80211_assign_chanctx(local, sdata);
+
+ switch (sdata->vif.type) {
+ case NL80211_IFTYPE_AP_VLAN:
+ case NL80211_IFTYPE_MONITOR:
+ break;
+ default:
+ ieee80211_reconfig_stations(sdata);
+ /* fall through */
+ case NL80211_IFTYPE_AP: /* AP stations are handled later */
+ for (i = 0; i < IEEE80211_NUM_ACS; i++)
+ drv_conf_tx(local, sdata, i,
+ &sdata->tx_conf[i]);
+ break;
+ }
+
/* common change flags for all interface types */
changed = BSS_CHANGED_ERP_CTS_PROT |
BSS_CHANGED_ERP_PREAMBLE |
@@ -1902,6 +1928,9 @@ int ieee80211_reconfig(struct ieee80211_local *local)
BSS_CHANGED_IDLE |
BSS_CHANGED_TXPOWER;
+ if (sdata->vif.mu_mimo_owner)
+ changed |= BSS_CHANGED_MU_GROUPS;
+
switch (sdata->vif.type) {
case NL80211_IFTYPE_STATION:
changed |= BSS_CHANGED_ASSOC |
@@ -2017,16 +2046,26 @@ int ieee80211_reconfig(struct ieee80211_local *local)
*/
if (sched_scan_req->n_scan_plans > 1 ||
__ieee80211_request_sched_scan_start(sched_scan_sdata,
- sched_scan_req))
+ sched_scan_req)) {
+ RCU_INIT_POINTER(local->sched_scan_sdata, NULL);
+ RCU_INIT_POINTER(local->sched_scan_req, NULL);
sched_scan_stopped = true;
+ }
mutex_unlock(&local->mtx);
if (sched_scan_stopped)
cfg80211_sched_scan_stopped_rtnl(local->hw.wiphy);
wake_up:
- local->in_reconfig = false;
- barrier();
+ if (local->in_reconfig) {
+ local->in_reconfig = false;
+ barrier();
+
+ /* Restart deferred ROCs */
+ mutex_lock(&local->mtx);
+ ieee80211_start_next_roc(local);
+ mutex_unlock(&local->mtx);
+ }
if (local->monitors == local->open_count && local->monitors > 0)
ieee80211_add_virtual_monitor(local);
@@ -2074,17 +2113,7 @@ int ieee80211_reconfig(struct ieee80211_local *local)
mb();
local->resuming = false;
- /* It's possible that we don't handle the scan completion in
- * time during suspend, so if it's still marked as completed
- * here, queue the work and flush it to clean things up.
- * Instead of calling the worker function directly here, we
- * really queue it to avoid potential races with other flows
- * scheduling the same work.
- */
- if (test_bit(SCAN_COMPLETED, &local->scanning)) {
- ieee80211_queue_delayed_work(&local->hw, &local->scan_work, 0);
- flush_delayed_work(&local->scan_work);
- }
+ ieee80211_flush_completed_scan(local, false);
if (local->open_count && !reconfig_due_to_wowlan)
drv_reconfig_complete(local, IEEE80211_RECONFIG_TYPE_SUSPEND);
@@ -2345,10 +2374,23 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
switch (chandef->width) {
case NL80211_CHAN_WIDTH_160:
- vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_160MHZ;
+ /*
+ * Convert 160 MHz channel width to new style as interop
+ * workaround.
+ */
+ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
+ vht_oper->center_freq_seg2_idx = vht_oper->center_freq_seg1_idx;
+ if (chandef->chan->center_freq < chandef->center_freq1)
+ vht_oper->center_freq_seg1_idx -= 8;
+ else
+ vht_oper->center_freq_seg1_idx += 8;
break;
case NL80211_CHAN_WIDTH_80P80:
- vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80P80MHZ;
+ /*
+ * Convert 80+80 MHz channel width to new style as interop
+ * workaround.
+ */
+ vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
break;
case NL80211_CHAN_WIDTH_80:
vht_oper->chan_width = IEEE80211_VHT_CHANWIDTH_80MHZ;
@@ -2364,17 +2406,13 @@ u8 *ieee80211_ie_build_vht_oper(u8 *pos, struct ieee80211_sta_vht_cap *vht_cap,
return pos + sizeof(struct ieee80211_vht_operation);
}
-void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
- const struct ieee80211_ht_operation *ht_oper,
- struct cfg80211_chan_def *chandef)
+bool ieee80211_chandef_ht_oper(const struct ieee80211_ht_operation *ht_oper,
+ struct cfg80211_chan_def *chandef)
{
enum nl80211_channel_type channel_type;
- if (!ht_oper) {
- cfg80211_chandef_create(chandef, control_chan,
- NL80211_CHAN_NO_HT);
- return;
- }
+ if (!ht_oper)
+ return false;
switch (ht_oper->ht_param & IEEE80211_HT_PARAM_CHA_SEC_OFFSET) {
case IEEE80211_HT_PARAM_CHA_SEC_NONE:
@@ -2388,42 +2426,66 @@ void ieee80211_ht_oper_to_chandef(struct ieee80211_channel *control_chan,
break;
default:
channel_type = NL80211_CHAN_NO_HT;
+ return false;
}
- cfg80211_chandef_create(chandef, control_chan, channel_type);
+ cfg80211_chandef_create(chandef, chandef->chan, channel_type);
+ return true;
}
-void ieee80211_vht_oper_to_chandef(struct ieee80211_channel *control_chan,
- const struct ieee80211_vht_operation *oper,
- struct cfg80211_chan_def *chandef)
+bool ieee80211_chandef_vht_oper(const struct ieee80211_vht_operation *oper,
+ struct cfg80211_chan_def *chandef)
{
+ struct cfg80211_chan_def new = *chandef;
+ int cf1, cf2;
+
if (!oper)
- return;
+ return false;
- chandef->chan = control_chan;
+ cf1 = ieee80211_channel_to_frequency(oper->center_freq_seg1_idx,
+ chandef->chan->band);
+ cf2 = ieee80211_channel_to_frequency(oper->center_freq_seg2_idx,
+ chandef->chan->band);
switch (oper->chan_width) {
case IEEE80211_VHT_CHANWIDTH_USE_HT:
break;
case IEEE80211_VHT_CHANWIDTH_80MHZ:
- chandef->width = NL80211_CHAN_WIDTH_80;
+ new.width = NL80211_CHAN_WIDTH_80;
+ new.center_freq1 = cf1;
+ /* If needed, adjust based on the newer interop workaround. */
+ if (oper->center_freq_seg2_idx) {
+ unsigned int diff;
+
+ diff = abs(oper->center_freq_seg2_idx -
+ oper->center_freq_seg1_idx);
+ if (diff == 8) {
+ new.width = NL80211_CHAN_WIDTH_160;
+ new.center_freq1 = cf2;
+ } else if (diff > 8) {
+ new.width = NL80211_CHAN_WIDTH_80P80;
+ new.center_freq2 = cf2;
+ }
+ }
break;
case IEEE80211_VHT_CHANWIDTH_160MHZ:
- chandef->width = NL80211_CHAN_WIDTH_160;
+ new.width = NL80211_CHAN_WIDTH_160;
+ new.center_freq1 = cf1;
break;
case IEEE80211_VHT_CHANWIDTH_80P80MHZ:
- chandef->width = NL80211_CHAN_WIDTH_80P80;
+ new.width = NL80211_CHAN_WIDTH_80P80;
+ new.center_freq1 = cf1;
+ new.center_freq2 = cf2;
break;
default:
- break;
+ return false;
}
- chandef->center_freq1 =
- ieee80211_channel_to_frequency(oper->center_freq_seg1_idx,
- control_chan->band);
- chandef->center_freq2 =
- ieee80211_channel_to_frequency(oper->center_freq_seg2_idx,
- control_chan->band);
+ if (!cfg80211_chandef_valid(&new))
+ return false;
+
+ *chandef = new;
+ return true;
}
int ieee80211_parse_bitrates(struct cfg80211_chan_def *chandef,
@@ -2646,6 +2708,18 @@ u64 ieee80211_calculate_rx_timestamp(struct ieee80211_local *local,
sband = local->hw.wiphy->bands[status->band];
bitrate = sband->bitrates[status->rate_idx].bitrate;
ri.legacy = DIV_ROUND_UP(bitrate, (1 << shift));
+
+ if (status->flag & RX_FLAG_MACTIME_PLCP_START) {
+ /* TODO: handle HT/VHT preambles */
+ if (status->band == IEEE80211_BAND_5GHZ) {
+ ts += 20 << shift;
+ mpdu_offset += 2;
+ } else if (status->flag & RX_FLAG_SHORTPRE) {
+ ts += 96;
+ } else {
+ ts += 192;
+ }
+ }
}
rate = cfg80211_calculate_bitrate(&ri);
@@ -3331,3 +3405,17 @@ void ieee80211_init_tx_queue(struct ieee80211_sub_if_data *sdata,
txqi->txq.ac = IEEE80211_AC_BE;
}
}
+
+void ieee80211_txq_get_depth(struct ieee80211_txq *txq,
+ unsigned long *frame_cnt,
+ unsigned long *byte_cnt)
+{
+ struct txq_info *txqi = to_txq_info(txq);
+
+ if (frame_cnt)
+ *frame_cnt = txqi->queue.qlen;
+
+ if (byte_cnt)
+ *byte_cnt = txqi->byte_cnt;
+}
+EXPORT_SYMBOL(ieee80211_txq_get_depth);
diff --git a/net/mac80211/vht.c b/net/mac80211/vht.c
index ff1c798921a6..e590e2ef9eaf 100644
--- a/net/mac80211/vht.c
+++ b/net/mac80211/vht.c
@@ -1,6 +1,9 @@
/*
* VHT handling
*
+ * Portions of this file
+ * Copyright(c) 2015 - 2016 Intel Deutschland GmbH
+ *
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
@@ -278,6 +281,23 @@ ieee80211_vht_cap_ie_to_sta_vht_cap(struct ieee80211_sub_if_data *sdata,
}
sta->sta.bandwidth = ieee80211_sta_cur_vht_bw(sta);
+
+ /* If HT IE reported 3839 bytes only, stay with that size. */
+ if (sta->sta.max_amsdu_len == IEEE80211_MAX_MPDU_LEN_HT_3839)
+ return;
+
+ switch (vht_cap->cap & IEEE80211_VHT_CAP_MAX_MPDU_MASK) {
+ case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_11454:
+ sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_11454;
+ break;
+ case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_7991:
+ sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_7991;
+ break;
+ case IEEE80211_VHT_CAP_MAX_MPDU_LENGTH_3895:
+ default:
+ sta->sta.max_amsdu_len = IEEE80211_MAX_MPDU_LEN_VHT_3895;
+ break;
+ }
}
enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
@@ -299,7 +319,30 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cap_rx_bw(struct sta_info *sta)
return IEEE80211_STA_RX_BW_80;
}
-static enum ieee80211_sta_rx_bandwidth
+enum nl80211_chan_width ieee80211_sta_cap_chan_bw(struct sta_info *sta)
+{
+ struct ieee80211_sta_vht_cap *vht_cap = &sta->sta.vht_cap;
+ u32 cap_width;
+
+ if (!vht_cap->vht_supported) {
+ if (!sta->sta.ht_cap.ht_supported)
+ return NL80211_CHAN_WIDTH_20_NOHT;
+
+ return sta->sta.ht_cap.cap & IEEE80211_HT_CAP_SUP_WIDTH_20_40 ?
+ NL80211_CHAN_WIDTH_40 : NL80211_CHAN_WIDTH_20;
+ }
+
+ cap_width = vht_cap->cap & IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_MASK;
+
+ if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160MHZ)
+ return NL80211_CHAN_WIDTH_160;
+ else if (cap_width == IEEE80211_VHT_CAP_SUPP_CHAN_WIDTH_160_80PLUS80MHZ)
+ return NL80211_CHAN_WIDTH_80P80;
+
+ return NL80211_CHAN_WIDTH_80;
+}
+
+enum ieee80211_sta_rx_bandwidth
ieee80211_chan_width_to_rx_bw(enum nl80211_chan_width width)
{
switch (width) {
@@ -327,10 +370,7 @@ enum ieee80211_sta_rx_bandwidth ieee80211_sta_cur_vht_bw(struct sta_info *sta)
bw = ieee80211_sta_cap_rx_bw(sta);
bw = min(bw, sta->cur_max_bandwidth);
-
- /* do not cap the BW of TDLS WIDER_BW peers by the bss */
- if (!test_sta_flag(sta, WLAN_STA_TDLS_WIDER_BW))
- bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width));
+ bw = min(bw, ieee80211_chan_width_to_rx_bw(bss_width));
return bw;
}
@@ -378,7 +418,7 @@ void ieee80211_sta_set_rx_nss(struct sta_info *sta)
u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only)
+ enum ieee80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband;
@@ -401,9 +441,6 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
changed |= IEEE80211_RC_NSS_CHANGED;
}
- if (nss_only)
- return changed;
-
switch (opmode & IEEE80211_OPMODE_NOTIF_CHANWIDTH_MASK) {
case IEEE80211_OPMODE_NOTIF_CHANWIDTH_20MHZ:
sta->cur_max_bandwidth = IEEE80211_STA_RX_BW_20;
@@ -428,15 +465,51 @@ u32 __ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
return changed;
}
+void ieee80211_process_mu_groups(struct ieee80211_sub_if_data *sdata,
+ struct ieee80211_mgmt *mgmt)
+{
+ struct ieee80211_bss_conf *bss_conf = &sdata->vif.bss_conf;
+
+ if (!sdata->vif.mu_mimo_owner)
+ return;
+
+ if (!memcmp(mgmt->u.action.u.vht_group_notif.position,
+ bss_conf->mu_group.position, WLAN_USER_POSITION_LEN) &&
+ !memcmp(mgmt->u.action.u.vht_group_notif.membership,
+ bss_conf->mu_group.membership, WLAN_MEMBERSHIP_LEN))
+ return;
+
+ memcpy(bss_conf->mu_group.membership,
+ mgmt->u.action.u.vht_group_notif.membership,
+ WLAN_MEMBERSHIP_LEN);
+ memcpy(bss_conf->mu_group.position,
+ mgmt->u.action.u.vht_group_notif.position,
+ WLAN_USER_POSITION_LEN);
+
+ ieee80211_bss_info_change_notify(sdata, BSS_CHANGED_MU_GROUPS);
+}
+
+void ieee80211_update_mu_groups(struct ieee80211_vif *vif,
+ const u8 *membership, const u8 *position)
+{
+ struct ieee80211_bss_conf *bss_conf = &vif->bss_conf;
+
+ if (WARN_ON_ONCE(!vif->mu_mimo_owner))
+ return;
+
+ memcpy(bss_conf->mu_group.membership, membership, WLAN_MEMBERSHIP_LEN);
+ memcpy(bss_conf->mu_group.position, position, WLAN_USER_POSITION_LEN);
+}
+EXPORT_SYMBOL_GPL(ieee80211_update_mu_groups);
+
void ieee80211_vht_handle_opmode(struct ieee80211_sub_if_data *sdata,
struct sta_info *sta, u8 opmode,
- enum ieee80211_band band, bool nss_only)
+ enum ieee80211_band band)
{
struct ieee80211_local *local = sdata->local;
struct ieee80211_supported_band *sband = local->hw.wiphy->bands[band];
- u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode,
- band, nss_only);
+ u32 changed = __ieee80211_vht_handle_opmode(sdata, sta, opmode, band);
if (changed > 0)
rate_control_rate_update(local, sband, sta, changed);
diff --git a/net/mac80211/wpa.c b/net/mac80211/wpa.c
index d824c38971ed..18848258adde 100644
--- a/net/mac80211/wpa.c
+++ b/net/mac80211/wpa.c
@@ -1,6 +1,7 @@
/*
* Copyright 2002-2004, Instant802 Networks, Inc.
* Copyright 2008, Jouni Malinen <j@w1.fi>
+ * Copyright (C) 2016 Intel Deutschland GmbH
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
@@ -183,7 +184,6 @@ mic_fail_no_key:
return RX_DROP_UNUSABLE;
}
-
static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
@@ -191,6 +191,7 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
struct ieee80211_tx_info *info = IEEE80211_SKB_CB(skb);
unsigned int hdrlen;
int len, tail;
+ u64 pn;
u8 *pos;
if (info->control.hw_key &&
@@ -222,12 +223,8 @@ static int tkip_encrypt_skb(struct ieee80211_tx_data *tx, struct sk_buff *skb)
return 0;
/* Increase IV for the frame */
- spin_lock(&key->u.tkip.txlock);
- key->u.tkip.tx.iv16++;
- if (key->u.tkip.tx.iv16 == 0)
- key->u.tkip.tx.iv32++;
- pos = ieee80211_tkip_add_iv(pos, key);
- spin_unlock(&key->u.tkip.txlock);
+ pn = atomic64_inc_return(&key->conf.tx_pn);
+ pos = ieee80211_tkip_add_iv(pos, &key->conf, pn);
/* hwaccel - with software IV */
if (info->control.hw_key)
diff --git a/net/mac802154/driver-ops.h b/net/mac802154/driver-ops.h
index 0550f3365e33..fd9daf2ecec9 100644
--- a/net/mac802154/driver-ops.h
+++ b/net/mac802154/driver-ops.h
@@ -18,9 +18,6 @@ drv_xmit_async(struct ieee802154_local *local, struct sk_buff *skb)
static inline int
drv_xmit_sync(struct ieee802154_local *local, struct sk_buff *skb)
{
- /* don't allow other operations while sync xmit */
- ASSERT_RTNL();
-
might_sleep();
return local->ops->xmit_sync(&local->hw, skb);
diff --git a/net/mac802154/llsec.c b/net/mac802154/llsec.c
index a13d02b7cee4..6a3e1c2181d3 100644
--- a/net/mac802154/llsec.c
+++ b/net/mac802154/llsec.c
@@ -17,9 +17,9 @@
#include <linux/err.h>
#include <linux/bug.h>
#include <linux/completion.h>
-#include <linux/crypto.h>
#include <linux/ieee802154.h>
#include <crypto/aead.h>
+#include <crypto/skcipher.h>
#include "ieee802154_i.h"
#include "llsec.h"
@@ -144,18 +144,18 @@ llsec_key_alloc(const struct ieee802154_llsec_key *template)
goto err_tfm;
}
- key->tfm0 = crypto_alloc_blkcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
+ key->tfm0 = crypto_alloc_skcipher("ctr(aes)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(key->tfm0))
goto err_tfm;
- if (crypto_blkcipher_setkey(key->tfm0, template->key,
- IEEE802154_LLSEC_KEY_SIZE))
+ if (crypto_skcipher_setkey(key->tfm0, template->key,
+ IEEE802154_LLSEC_KEY_SIZE))
goto err_tfm0;
return key;
err_tfm0:
- crypto_free_blkcipher(key->tfm0);
+ crypto_free_skcipher(key->tfm0);
err_tfm:
for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
if (key->tfm[i])
@@ -175,7 +175,7 @@ static void llsec_key_release(struct kref *ref)
for (i = 0; i < ARRAY_SIZE(key->tfm); i++)
crypto_free_aead(key->tfm[i]);
- crypto_free_blkcipher(key->tfm0);
+ crypto_free_skcipher(key->tfm0);
kzfree(key);
}
@@ -620,15 +620,17 @@ llsec_do_encrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
{
u8 iv[16];
struct scatterlist src;
- struct blkcipher_desc req = {
- .tfm = key->tfm0,
- .info = iv,
- .flags = 0,
- };
+ SKCIPHER_REQUEST_ON_STACK(req, key->tfm0);
+ int err;
llsec_geniv(iv, sec->params.hwaddr, &hdr->sec);
sg_init_one(&src, skb->data, skb->len);
- return crypto_blkcipher_encrypt_iv(&req, &src, &src, skb->len);
+ skcipher_request_set_tfm(req, key->tfm0);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &src, &src, skb->len, iv);
+ err = crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
+ return err;
}
static struct crypto_aead*
@@ -830,11 +832,8 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
unsigned char *data;
int datalen;
struct scatterlist src;
- struct blkcipher_desc req = {
- .tfm = key->tfm0,
- .info = iv,
- .flags = 0,
- };
+ SKCIPHER_REQUEST_ON_STACK(req, key->tfm0);
+ int err;
llsec_geniv(iv, dev_addr, &hdr->sec);
data = skb_mac_header(skb) + skb->mac_len;
@@ -842,7 +841,13 @@ llsec_do_decrypt_unauth(struct sk_buff *skb, const struct mac802154_llsec *sec,
sg_init_one(&src, data, datalen);
- return crypto_blkcipher_decrypt_iv(&req, &src, &src, datalen);
+ skcipher_request_set_tfm(req, key->tfm0);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &src, &src, datalen, iv);
+
+ err = crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
+ return err;
}
static int
diff --git a/net/mac802154/llsec.h b/net/mac802154/llsec.h
index 950578e1d7be..6f3b658e3279 100644
--- a/net/mac802154/llsec.h
+++ b/net/mac802154/llsec.h
@@ -19,7 +19,6 @@
#include <linux/slab.h>
#include <linux/hashtable.h>
-#include <linux/crypto.h>
#include <linux/kref.h>
#include <linux/spinlock.h>
#include <net/af_ieee802154.h>
@@ -30,7 +29,7 @@ struct mac802154_llsec_key {
/* one tfm for each authsize (4/8/16) */
struct crypto_aead *tfm[3];
- struct crypto_blkcipher *tfm0;
+ struct crypto_skcipher *tfm0;
struct kref ref;
};
diff --git a/net/mac802154/mac_cmd.c b/net/mac802154/mac_cmd.c
index 8606da459ff3..3db16346cab3 100644
--- a/net/mac802154/mac_cmd.c
+++ b/net/mac802154/mac_cmd.c
@@ -126,7 +126,7 @@ static void mac802154_get_mac_params(struct net_device *dev,
params->lbt = wpan_dev->lbt;
}
-static struct ieee802154_llsec_ops mac802154_llsec_ops = {
+static const struct ieee802154_llsec_ops mac802154_llsec_ops = {
.get_params = mac802154_get_params,
.set_params = mac802154_set_params,
.add_key = mac802154_add_key,
diff --git a/net/mac802154/main.c b/net/mac802154/main.c
index e8cab5bb80c6..87da85ae5a6b 100644
--- a/net/mac802154/main.c
+++ b/net/mac802154/main.c
@@ -218,7 +218,6 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw)
tasklet_kill(&local->tasklet);
flush_workqueue(local->workqueue);
- destroy_workqueue(local->workqueue);
rtnl_lock();
@@ -226,6 +225,7 @@ void ieee802154_unregister_hw(struct ieee802154_hw *hw)
rtnl_unlock();
+ destroy_workqueue(local->workqueue);
wpan_phy_unregister(local->phy);
}
EXPORT_SYMBOL(ieee802154_unregister_hw);
diff --git a/net/mac802154/rx.c b/net/mac802154/rx.c
index 42e96729dae6..446e1300383e 100644
--- a/net/mac802154/rx.c
+++ b/net/mac802154/rx.c
@@ -217,8 +217,7 @@ __ieee802154_rx_handle_packet(struct ieee802154_local *local,
break;
}
- if (skb)
- kfree_skb(skb);
+ kfree_skb(skb);
}
static void
diff --git a/net/mac802154/tx.c b/net/mac802154/tx.c
index 3827f359b336..7e253455f9dd 100644
--- a/net/mac802154/tx.c
+++ b/net/mac802154/tx.c
@@ -38,12 +38,6 @@ void ieee802154_xmit_worker(struct work_struct *work)
struct net_device *dev = skb->dev;
int res;
- rtnl_lock();
-
- /* check if ifdown occurred while schedule */
- if (!netif_running(dev))
- goto err_tx;
-
res = drv_xmit_sync(local, skb);
if (res)
goto err_tx;
@@ -53,14 +47,11 @@ void ieee802154_xmit_worker(struct work_struct *work)
dev->stats.tx_packets++;
dev->stats.tx_bytes += skb->len;
- rtnl_unlock();
-
return;
err_tx:
/* Restart the netif queue on each sub_if_data object. */
ieee802154_wake_queue(&local->hw);
- rtnl_unlock();
kfree_skb(skb);
netdev_dbg(dev, "transmission failed\n");
}
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index c70d750148b6..0b80a7140cc4 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -27,6 +27,8 @@
*/
#define MAX_MP_SELECT_LABELS 4
+#define MPLS_NEIGH_TABLE_UNSPEC (NEIGH_LINK_TABLE + 1)
+
static int zero = 0;
static int label_limit = (1 << 20) - 1;
@@ -96,22 +98,15 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned int mtu)
}
EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
-static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
- struct sk_buff *skb, bool bos)
+static u32 mpls_multipath_hash(struct mpls_route *rt,
+ struct sk_buff *skb, bool bos)
{
struct mpls_entry_decoded dec;
struct mpls_shim_hdr *hdr;
bool eli_seen = false;
int label_index;
- int nh_index = 0;
u32 hash = 0;
- /* No need to look further into packet if there's only
- * one path
- */
- if (rt->rt_nhn == 1)
- goto out;
-
for (label_index = 0; label_index < MAX_MP_SELECT_LABELS && !bos;
label_index++) {
if (!pskb_may_pull(skb, sizeof(*hdr) * label_index))
@@ -165,7 +160,38 @@ static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
}
}
- nh_index = hash % rt->rt_nhn;
+ return hash;
+}
+
+static struct mpls_nh *mpls_select_multipath(struct mpls_route *rt,
+ struct sk_buff *skb, bool bos)
+{
+ int alive = ACCESS_ONCE(rt->rt_nhn_alive);
+ u32 hash = 0;
+ int nh_index = 0;
+ int n = 0;
+
+ /* No need to look further into packet if there's only
+ * one path
+ */
+ if (rt->rt_nhn == 1)
+ goto out;
+
+ if (alive <= 0)
+ return NULL;
+
+ hash = mpls_multipath_hash(rt, skb, bos);
+ nh_index = hash % alive;
+ if (alive == rt->rt_nhn)
+ goto out;
+ for_nexthops(rt) {
+ if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+ continue;
+ if (n == nh_index)
+ return nh;
+ n++;
+ } endfor_nexthops(rt);
+
out:
return &rt->rt_nh[nh_index];
}
@@ -317,7 +343,13 @@ static int mpls_forward(struct sk_buff *skb, struct net_device *dev,
}
}
- err = neigh_xmit(nh->nh_via_table, out_dev, mpls_nh_via(rt, nh), skb);
+ /* If via wasn't specified then send out using device address */
+ if (nh->nh_via_table == MPLS_NEIGH_TABLE_UNSPEC)
+ err = neigh_xmit(NEIGH_LINK_TABLE, out_dev,
+ out_dev->dev_addr, skb);
+ else
+ err = neigh_xmit(nh->nh_via_table, out_dev,
+ mpls_nh_via(rt, nh), skb);
if (err)
net_dbg_ratelimited("%s: packet transmission failed: %d\n",
__func__, err);
@@ -365,6 +397,7 @@ static struct mpls_route *mpls_rt_alloc(int num_nh, u8 max_alen)
GFP_KERNEL);
if (rt) {
rt->rt_nhn = num_nh;
+ rt->rt_nhn_alive = num_nh;
rt->rt_max_alen = max_alen_aligned;
}
@@ -510,6 +543,9 @@ static struct net_device *find_outdev(struct net *net,
if (!dev)
return ERR_PTR(-ENODEV);
+ if (IS_ERR(dev))
+ return dev;
+
/* The caller is holding rtnl anyways, so release the dev reference */
dev_put(dev);
@@ -534,8 +570,22 @@ static int mpls_nh_assign_dev(struct net *net, struct mpls_route *rt,
if (!mpls_dev_get(dev))
goto errout;
+ if ((nh->nh_via_table == NEIGH_LINK_TABLE) &&
+ (dev->addr_len != nh->nh_via_alen))
+ goto errout;
+
RCU_INIT_POINTER(nh->nh_dev, dev);
+ if (!(dev->flags & IFF_UP)) {
+ nh->nh_flags |= RTNH_F_DEAD;
+ } else {
+ unsigned int flags;
+
+ flags = dev_get_flags(dev);
+ if (!(flags & (IFF_RUNNING | IFF_LOWER_UP)))
+ nh->nh_flags |= RTNH_F_LINKDOWN;
+ }
+
return 0;
errout:
@@ -570,6 +620,9 @@ static int mpls_nh_build_from_cfg(struct mpls_route_config *cfg,
if (err)
goto errout;
+ if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+ rt->rt_nhn_alive--;
+
return 0;
errout:
@@ -577,8 +630,8 @@ errout:
}
static int mpls_nh_build(struct net *net, struct mpls_route *rt,
- struct mpls_nh *nh, int oif,
- struct nlattr *via, struct nlattr *newdst)
+ struct mpls_nh *nh, int oif, struct nlattr *via,
+ struct nlattr *newdst)
{
int err = -ENOMEM;
@@ -592,10 +645,14 @@ static int mpls_nh_build(struct net *net, struct mpls_route *rt,
goto errout;
}
- err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table,
- __mpls_nh_via(rt, nh));
- if (err)
- goto errout;
+ if (via) {
+ err = nla_get_via(via, &nh->nh_via_alen, &nh->nh_via_table,
+ __mpls_nh_via(rt, nh));
+ if (err)
+ goto errout;
+ } else {
+ nh->nh_via_table = MPLS_NEIGH_TABLE_UNSPEC;
+ }
err = mpls_nh_assign_dev(net, rt, nh, oif);
if (err)
@@ -677,15 +734,14 @@ static int mpls_nh_build_multi(struct mpls_route_config *cfg,
nla_newdst = nla_find(attrs, attrlen, RTA_NEWDST);
}
- if (!nla_via)
- goto errout;
-
err = mpls_nh_build(cfg->rc_nlinfo.nl_net, rt, nh,
- rtnh->rtnh_ifindex, nla_via,
- nla_newdst);
+ rtnh->rtnh_ifindex, nla_via, nla_newdst);
if (err)
goto errout;
+ if (nh->nh_flags & (RTNH_F_DEAD | RTNH_F_LINKDOWN))
+ rt->rt_nhn_alive--;
+
rtnh = rtnh_next(rtnh, &remaining);
nhs++;
} endfor_nexthops(rt);
@@ -875,34 +931,74 @@ free:
return ERR_PTR(err);
}
-static void mpls_ifdown(struct net_device *dev)
+static void mpls_ifdown(struct net_device *dev, int event)
{
struct mpls_route __rcu **platform_label;
struct net *net = dev_net(dev);
- struct mpls_dev *mdev;
unsigned index;
platform_label = rtnl_dereference(net->mpls.platform_label);
for (index = 0; index < net->mpls.platform_labels; index++) {
struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+
if (!rt)
continue;
- for_nexthops(rt) {
+
+ change_nexthops(rt) {
if (rtnl_dereference(nh->nh_dev) != dev)
continue;
- nh->nh_dev = NULL;
+ switch (event) {
+ case NETDEV_DOWN:
+ case NETDEV_UNREGISTER:
+ nh->nh_flags |= RTNH_F_DEAD;
+ /* fall through */
+ case NETDEV_CHANGE:
+ nh->nh_flags |= RTNH_F_LINKDOWN;
+ ACCESS_ONCE(rt->rt_nhn_alive) = rt->rt_nhn_alive - 1;
+ break;
+ }
+ if (event == NETDEV_UNREGISTER)
+ RCU_INIT_POINTER(nh->nh_dev, NULL);
} endfor_nexthops(rt);
}
- mdev = mpls_dev_get(dev);
- if (!mdev)
- return;
- mpls_dev_sysctl_unregister(mdev);
+ return;
+}
+
+static void mpls_ifup(struct net_device *dev, unsigned int nh_flags)
+{
+ struct mpls_route __rcu **platform_label;
+ struct net *net = dev_net(dev);
+ unsigned index;
+ int alive;
+
+ platform_label = rtnl_dereference(net->mpls.platform_label);
+ for (index = 0; index < net->mpls.platform_labels; index++) {
+ struct mpls_route *rt = rtnl_dereference(platform_label[index]);
+
+ if (!rt)
+ continue;
+
+ alive = 0;
+ change_nexthops(rt) {
+ struct net_device *nh_dev =
+ rtnl_dereference(nh->nh_dev);
- RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+ if (!(nh->nh_flags & nh_flags)) {
+ alive++;
+ continue;
+ }
+ if (nh_dev != dev)
+ continue;
+ alive++;
+ nh->nh_flags &= ~nh_flags;
+ } endfor_nexthops(rt);
+
+ ACCESS_ONCE(rt->rt_nhn_alive) = alive;
+ }
- kfree_rcu(mdev, rcu);
+ return;
}
static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
@@ -910,9 +1006,9 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct mpls_dev *mdev;
+ unsigned int flags;
- switch(event) {
- case NETDEV_REGISTER:
+ if (event == NETDEV_REGISTER) {
/* For now just support ethernet devices */
if ((dev->type == ARPHRD_ETHER) ||
(dev->type == ARPHRD_LOOPBACK)) {
@@ -920,10 +1016,39 @@ static int mpls_dev_notify(struct notifier_block *this, unsigned long event,
if (IS_ERR(mdev))
return notifier_from_errno(PTR_ERR(mdev));
}
- break;
+ return NOTIFY_OK;
+ }
+
+ mdev = mpls_dev_get(dev);
+ if (!mdev)
+ return NOTIFY_OK;
+ switch (event) {
+ case NETDEV_DOWN:
+ mpls_ifdown(dev, event);
+ break;
+ case NETDEV_UP:
+ flags = dev_get_flags(dev);
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
+ else
+ mpls_ifup(dev, RTNH_F_DEAD);
+ break;
+ case NETDEV_CHANGE:
+ flags = dev_get_flags(dev);
+ if (flags & (IFF_RUNNING | IFF_LOWER_UP))
+ mpls_ifup(dev, RTNH_F_DEAD | RTNH_F_LINKDOWN);
+ else
+ mpls_ifdown(dev, event);
+ break;
case NETDEV_UNREGISTER:
- mpls_ifdown(dev);
+ mpls_ifdown(dev, event);
+ mdev = mpls_dev_get(dev);
+ if (mdev) {
+ mpls_dev_sysctl_unregister(mdev);
+ RCU_INIT_POINTER(dev->mpls_ptr, NULL);
+ kfree_rcu(mdev, rcu);
+ }
break;
case NETDEV_CHANGENAME:
mdev = mpls_dev_get(dev);
@@ -1118,6 +1243,7 @@ static int rtm_to_route_config(struct sk_buff *skb, struct nlmsghdr *nlh,
cfg->rc_label = LABEL_NOT_SPECIFIED;
cfg->rc_protocol = rtm->rtm_protocol;
+ cfg->rc_via_table = MPLS_NEIGH_TABLE_UNSPEC;
cfg->rc_nlflags = nlh->nlmsg_flags;
cfg->rc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->rc_nlinfo.nlh = nlh;
@@ -1231,15 +1357,22 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
nla_put_labels(skb, RTA_NEWDST, nh->nh_labels,
nh->nh_label))
goto nla_put_failure;
- if (nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh),
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC &&
+ nla_put_via(skb, nh->nh_via_table, mpls_nh_via(rt, nh),
nh->nh_via_alen))
goto nla_put_failure;
dev = rtnl_dereference(nh->nh_dev);
if (dev && nla_put_u32(skb, RTA_OIF, dev->ifindex))
goto nla_put_failure;
+ if (nh->nh_flags & RTNH_F_LINKDOWN)
+ rtm->rtm_flags |= RTNH_F_LINKDOWN;
+ if (nh->nh_flags & RTNH_F_DEAD)
+ rtm->rtm_flags |= RTNH_F_DEAD;
} else {
struct rtnexthop *rtnh;
struct nlattr *mp;
+ int dead = 0;
+ int linkdown = 0;
mp = nla_nest_start(skb, RTA_MULTIPATH);
if (!mp)
@@ -1253,11 +1386,21 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
dev = rtnl_dereference(nh->nh_dev);
if (dev)
rtnh->rtnh_ifindex = dev->ifindex;
+ if (nh->nh_flags & RTNH_F_LINKDOWN) {
+ rtnh->rtnh_flags |= RTNH_F_LINKDOWN;
+ linkdown++;
+ }
+ if (nh->nh_flags & RTNH_F_DEAD) {
+ rtnh->rtnh_flags |= RTNH_F_DEAD;
+ dead++;
+ }
+
if (nh->nh_labels && nla_put_labels(skb, RTA_NEWDST,
nh->nh_labels,
nh->nh_label))
goto nla_put_failure;
- if (nla_put_via(skb, nh->nh_via_table,
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC &&
+ nla_put_via(skb, nh->nh_via_table,
mpls_nh_via(rt, nh),
nh->nh_via_alen))
goto nla_put_failure;
@@ -1266,6 +1409,11 @@ static int mpls_dump_route(struct sk_buff *skb, u32 portid, u32 seq, int event,
rtnh->rtnh_len = nlmsg_get_pos(skb) - (void *)rtnh;
} endfor_nexthops(rt);
+ if (linkdown == rt->rt_nhn)
+ rtm->rtm_flags |= RTNH_F_LINKDOWN;
+ if (dead == rt->rt_nhn)
+ rtm->rtm_flags |= RTNH_F_DEAD;
+
nla_nest_end(skb, mp);
}
@@ -1319,7 +1467,8 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
if (nh->nh_dev)
payload += nla_total_size(4); /* RTA_OIF */
- payload += nla_total_size(2 + nh->nh_via_alen); /* RTA_VIA */
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC) /* RTA_VIA */
+ payload += nla_total_size(2 + nh->nh_via_alen);
if (nh->nh_labels) /* RTA_NEWDST */
payload += nla_total_size(nh->nh_labels * 4);
} else {
@@ -1328,7 +1477,9 @@ static inline size_t lfib_nlmsg_size(struct mpls_route *rt)
for_nexthops(rt) {
nhsize += nla_total_size(sizeof(struct rtnexthop));
- nhsize += nla_total_size(2 + nh->nh_via_alen);
+ /* RTA_VIA */
+ if (nh->nh_via_table != MPLS_NEIGH_TABLE_UNSPEC)
+ nhsize += nla_total_size(2 + nh->nh_via_alen);
if (nh->nh_labels)
nhsize += nla_total_size(nh->nh_labels * 4);
} endfor_nexthops(rt);
diff --git a/net/mpls/internal.h b/net/mpls/internal.h
index bde52ce88c94..732a5c17e986 100644
--- a/net/mpls/internal.h
+++ b/net/mpls/internal.h
@@ -41,6 +41,7 @@ enum mpls_payload_type {
struct mpls_nh { /* next hop label forwarding entry */
struct net_device __rcu *nh_dev;
+ unsigned int nh_flags;
u32 nh_label[MAX_NEW_LABELS];
u8 nh_labels;
u8 nh_via_alen;
@@ -74,6 +75,7 @@ struct mpls_route { /* next hop label forwarding entry */
u8 rt_payload_type;
u8 rt_max_alen;
unsigned int rt_nhn;
+ unsigned int rt_nhn_alive;
struct mpls_nh rt_nh[0];
};
diff --git a/net/mpls/mpls_iptunnel.c b/net/mpls/mpls_iptunnel.c
index 67591aef9cae..644a8da6d4bd 100644
--- a/net/mpls/mpls_iptunnel.c
+++ b/net/mpls/mpls_iptunnel.c
@@ -37,7 +37,7 @@ static unsigned int mpls_encap_size(struct mpls_iptunnel_encap *en)
return en->labels * sizeof(struct mpls_shim_hdr);
}
-int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
+static int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
{
struct mpls_iptunnel_encap *tun_encap_info;
struct mpls_shim_hdr *hdr;
@@ -54,10 +54,10 @@ int mpls_output(struct net *net, struct sock *sk, struct sk_buff *skb)
unsigned int ttl;
/* Obtain the ttl */
- if (skb->protocol == htons(ETH_P_IP)) {
+ if (dst->ops->family == AF_INET) {
ttl = ip_hdr(skb)->ttl;
rt = (struct rtable *)dst;
- } else if (skb->protocol == htons(ETH_P_IPV6)) {
+ } else if (dst->ops->family == AF_INET6) {
ttl = ipv6_hdr(skb)->hop_limit;
rt6 = (struct rt6_info *)dst;
} else {
@@ -227,5 +227,6 @@ static void __exit mpls_iptunnel_exit(void)
}
module_exit(mpls_iptunnel_exit);
+MODULE_ALIAS_RTNL_LWT(MPLS);
MODULE_DESCRIPTION("MultiProtocol Label Switching IP Tunnels");
MODULE_LICENSE("GPL v2");
diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index e22349ea7256..95e757c377f9 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -563,6 +563,28 @@ config NFT_COMPAT
x_tables match/target extensions over the nf_tables
framework.
+if NF_TABLES_NETDEV
+
+config NF_DUP_NETDEV
+ tristate "Netfilter packet duplication support"
+ help
+ This option enables the generic packet duplication infrastructure
+ for Netfilter.
+
+config NFT_DUP_NETDEV
+ tristate "Netfilter nf_tables netdev packet duplication support"
+ select NF_DUP_NETDEV
+ help
+ This option enables packet duplication for the "netdev" family.
+
+config NFT_FWD_NETDEV
+ tristate "Netfilter nf_tables netdev packet forwarding support"
+ select NF_DUP_NETDEV
+ help
+ This option enables packet forwarding for the "netdev" family.
+
+endif # NF_TABLES_NETDEV
+
endif # NF_TABLES
config NETFILTER_XTABLES
@@ -869,7 +891,7 @@ config NETFILTER_XT_TARGET_TEE
depends on IPV6 || IPV6=n
depends on !NF_CONNTRACK || NF_CONNTRACK
select NF_DUP_IPV4
- select NF_DUP_IPV6 if IP6_NF_IPTABLES
+ select NF_DUP_IPV6 if IPV6
---help---
This option adds a "TEE" target with which a packet can be cloned and
this clone be rerouted to another nexthop.
@@ -882,7 +904,7 @@ config NETFILTER_XT_TARGET_TPROXY
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
depends on IP_NF_MANGLE
select NF_DEFRAG_IPV4
- select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
This option adds a `TPROXY' target, which is somewhat similar to
REDIRECT. It can only be used in the mangle table and is useful
@@ -1375,7 +1397,7 @@ config NETFILTER_XT_MATCH_SOCKET
depends on IPV6 || IPV6=n
depends on IP6_NF_IPTABLES || IP6_NF_IPTABLES=n
select NF_DEFRAG_IPV4
- select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES
+ select NF_DEFRAG_IPV6 if IP6_NF_IPTABLES != n
help
This option adds a `socket' match, which can be used to match
packets for which a TCP or UDP socket lookup finds a valid socket.
diff --git a/net/netfilter/Makefile b/net/netfilter/Makefile
index 7638c36b498c..69134541d65b 100644
--- a/net/netfilter/Makefile
+++ b/net/netfilter/Makefile
@@ -66,8 +66,11 @@ obj-$(CONFIG_NF_NAT_TFTP) += nf_nat_tftp.o
# SYNPROXY
obj-$(CONFIG_NETFILTER_SYNPROXY) += nf_synproxy_core.o
+# generic packet duplication from netdev family
+obj-$(CONFIG_NF_DUP_NETDEV) += nf_dup_netdev.o
+
# nf_tables
-nf_tables-objs += nf_tables_core.o nf_tables_api.o
+nf_tables-objs += nf_tables_core.o nf_tables_api.o nf_tables_trace.o
nf_tables-objs += nft_immediate.o nft_cmp.o nft_lookup.o nft_dynset.o
nf_tables-objs += nft_bitwise.o nft_byteorder.o nft_payload.o
@@ -90,6 +93,10 @@ obj-$(CONFIG_NFT_LOG) += nft_log.o
obj-$(CONFIG_NFT_MASQ) += nft_masq.o
obj-$(CONFIG_NFT_REDIR) += nft_redir.o
+# nf_tables netdev
+obj-$(CONFIG_NFT_DUP_NETDEV) += nft_dup_netdev.o
+obj-$(CONFIG_NFT_FWD_NETDEV) += nft_fwd_netdev.o
+
# generic X tables
obj-$(CONFIG_NETFILTER_XTABLES) += x_tables.o xt_tcpudp.o
diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h
index d05e759ed0fa..2e8e7e5fb4a6 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -33,7 +33,7 @@
#define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype MTYPE
-#define get_ext(set, map, id) ((map)->extensions + (set)->dsize * (id))
+#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
static void
mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
@@ -67,12 +67,9 @@ mtype_destroy(struct ip_set *set)
del_timer_sync(&map->gc);
ip_set_free(map->members);
- if (set->dsize) {
- if (set->extensions & IPSET_EXT_DESTROY)
- mtype_ext_cleanup(set);
- ip_set_free(map->extensions);
- }
- kfree(map);
+ if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
+ mtype_ext_cleanup(set);
+ ip_set_free(map);
set->data = NULL;
}
@@ -92,16 +89,14 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
{
const struct mtype *map = set->data;
struct nlattr *nested;
+ size_t memsize = sizeof(*map) + map->memsize;
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (mtype_do_head(skb, map) ||
- nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
- nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
- htonl(sizeof(*map) +
- map->memsize +
- set->dsize * map->elements)))
+ nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
+ nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
goto nla_put_failure;
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 64a564334418..4783efff0bde 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -41,7 +41,6 @@ MODULE_ALIAS("ip_set_bitmap:ip");
/* Type structure */
struct bitmap_ip {
void *members; /* the set members */
- void *extensions; /* data extensions */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
@@ -49,6 +48,8 @@ struct bitmap_ip {
size_t memsize; /* members size */
u8 netmask; /* subnet netmask */
struct timer_list gc; /* garbage collection */
+ unsigned char extensions[0] /* data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
@@ -224,13 +225,6 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_ip = first_ip;
map->last_ip = last_ip;
map->elements = elements;
@@ -316,13 +310,13 @@ bitmap_ip_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
pr_debug("hosts %u, elements %llu\n",
hosts, (unsigned long long)elements);
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ set->dsize = ip_set_elem_len(set, tb, 0, 0);
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
map->memsize = bitmap_bytes(0, elements - 1);
set->variant = &bitmap_ip;
- set->dsize = ip_set_elem_len(set, tb, 0);
if (!init_map_ip(set, map, first_ip, last_ip,
elements, hosts, netmask)) {
kfree(map);
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index 1430535118fb..9a065f672d3a 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -47,24 +47,26 @@ enum {
/* Type structure */
struct bitmap_ipmac {
void *members; /* the set members */
- void *extensions; /* MAC + data extensions */
u32 first_ip; /* host byte order, included in range */
u32 last_ip; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc; /* garbage collector */
+ unsigned char extensions[0] /* MAC + data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
struct bitmap_ipmac_adt_elem {
+ unsigned char ether[ETH_ALEN] __aligned(2);
u16 id;
- unsigned char *ether;
+ u16 add_mac;
};
struct bitmap_ipmac_elem {
unsigned char ether[ETH_ALEN];
unsigned char filled;
-} __attribute__ ((aligned));
+} __aligned(__alignof__(u64));
static inline u32
ip_to_id(const struct bitmap_ipmac *m, u32 ip)
@@ -72,11 +74,11 @@ ip_to_id(const struct bitmap_ipmac *m, u32 ip)
return ip - m->first_ip;
}
-static inline struct bitmap_ipmac_elem *
-get_elem(void *extensions, u16 id, size_t dsize)
-{
- return (struct bitmap_ipmac_elem *)(extensions + id * dsize);
-}
+#define get_elem(extensions, id, dsize) \
+ (struct bitmap_ipmac_elem *)(extensions + (id) * (dsize))
+
+#define get_const_elem(extensions, id, dsize) \
+ (const struct bitmap_ipmac_elem *)(extensions + (id) * (dsize))
/* Common functions */
@@ -88,10 +90,9 @@ bitmap_ipmac_do_test(const struct bitmap_ipmac_adt_elem *e,
if (!test_bit(e->id, map->members))
return 0;
- elem = get_elem(map->extensions, e->id, dsize);
- if (elem->filled == MAC_FILLED)
- return !e->ether ||
- ether_addr_equal(e->ether, elem->ether);
+ elem = get_const_elem(map->extensions, e->id, dsize);
+ if (e->add_mac && elem->filled == MAC_FILLED)
+ return ether_addr_equal(e->ether, elem->ether);
/* Trigger kernel to fill out the ethernet address */
return -EAGAIN;
}
@@ -103,7 +104,7 @@ bitmap_ipmac_gc_test(u16 id, const struct bitmap_ipmac *map, size_t dsize)
if (!test_bit(id, map->members))
return 0;
- elem = get_elem(map->extensions, id, dsize);
+ elem = get_const_elem(map->extensions, id, dsize);
/* Timer not started for the incomplete elements */
return elem->filled == MAC_FILLED;
}
@@ -133,7 +134,7 @@ bitmap_ipmac_add_timeout(unsigned long *timeout,
* and we can reuse it later when MAC is filled out,
* possibly by the kernel
*/
- if (e->ether)
+ if (e->add_mac)
ip_set_timeout_set(timeout, t);
else
*timeout = t;
@@ -150,7 +151,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
elem = get_elem(map->extensions, e->id, dsize);
if (test_bit(e->id, map->members)) {
if (elem->filled == MAC_FILLED) {
- if (e->ether &&
+ if (e->add_mac &&
(flags & IPSET_FLAG_EXIST) &&
!ether_addr_equal(e->ether, elem->ether)) {
/* memcpy isn't atomic */
@@ -159,7 +160,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
ether_addr_copy(elem->ether, e->ether);
}
return IPSET_ADD_FAILED;
- } else if (!e->ether)
+ } else if (!e->add_mac)
/* Already added without ethernet address */
return IPSET_ADD_FAILED;
/* Fill the MAC address and trigger the timer activation */
@@ -168,7 +169,7 @@ bitmap_ipmac_do_add(const struct bitmap_ipmac_adt_elem *e,
ether_addr_copy(elem->ether, e->ether);
elem->filled = MAC_FILLED;
return IPSET_ADD_START_STORED_TIMEOUT;
- } else if (e->ether) {
+ } else if (e->add_mac) {
/* We can store MAC too */
ether_addr_copy(elem->ether, e->ether);
elem->filled = MAC_FILLED;
@@ -191,7 +192,7 @@ bitmap_ipmac_do_list(struct sk_buff *skb, const struct bitmap_ipmac *map,
u32 id, size_t dsize)
{
const struct bitmap_ipmac_elem *elem =
- get_elem(map->extensions, id, dsize);
+ get_const_elem(map->extensions, id, dsize);
return nla_put_ipaddr4(skb, IPSET_ATTR_IP,
htonl(map->first_ip + id)) ||
@@ -213,7 +214,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
{
struct bitmap_ipmac *map = set->data;
ipset_adtfn adtfn = set->variant->adt[adt];
- struct bitmap_ipmac_adt_elem e = { .id = 0 };
+ struct bitmap_ipmac_adt_elem e = { .id = 0, .add_mac = 1 };
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
u32 ip;
@@ -231,7 +232,7 @@ bitmap_ipmac_kadt(struct ip_set *set, const struct sk_buff *skb,
return -EINVAL;
e.id = ip_to_id(map, ip);
- e.ether = eth_hdr(skb)->h_source;
+ memcpy(e.ether, eth_hdr(skb)->h_source, ETH_ALEN);
return adtfn(set, &e, &ext, &opt->ext, opt->cmdflags);
}
@@ -265,11 +266,12 @@ bitmap_ipmac_uadt(struct ip_set *set, struct nlattr *tb[],
return -IPSET_ERR_BITMAP_RANGE;
e.id = ip_to_id(map, ip);
- if (tb[IPSET_ATTR_ETHER])
- e.ether = nla_data(tb[IPSET_ATTR_ETHER]);
- else
- e.ether = NULL;
-
+ if (tb[IPSET_ATTR_ETHER]) {
+ if (nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN)
+ return -IPSET_ERR_PROTOCOL;
+ memcpy(e.ether, nla_data(tb[IPSET_ATTR_ETHER]), ETH_ALEN);
+ e.add_mac = 1;
+ }
ret = adtfn(set, &e, &ext, &ext, flags);
return ip_set_eexist(ret, flags) ? 0 : ret;
@@ -300,13 +302,6 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_ip = first_ip;
map->last_ip = last_ip;
map->elements = elements;
@@ -361,14 +356,15 @@ bitmap_ipmac_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
if (elements > IPSET_BITMAP_MAX_RANGE + 1)
return -IPSET_ERR_BITMAP_RANGE_SIZE;
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ set->dsize = ip_set_elem_len(set, tb,
+ sizeof(struct bitmap_ipmac_elem),
+ __alignof__(struct bitmap_ipmac_elem));
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
map->memsize = bitmap_bytes(0, elements - 1);
set->variant = &bitmap_ipmac;
- set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct bitmap_ipmac_elem));
if (!init_map_ipmac(set, map, first_ip, last_ip, elements)) {
kfree(map);
return -ENOMEM;
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c
index 5338ccd5da46..7f0c733358a4 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -35,12 +35,13 @@ MODULE_ALIAS("ip_set_bitmap:port");
/* Type structure */
struct bitmap_port {
void *members; /* the set members */
- void *extensions; /* data extensions */
u16 first_port; /* host byte order, included in range */
u16 last_port; /* host byte order, included in range */
u32 elements; /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc; /* garbage collection */
+ unsigned char extensions[0] /* data extensions */
+ __aligned(__alignof__(u64));
};
/* ADT structure for generic function args */
@@ -209,13 +210,6 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,
map->members = ip_set_alloc(map->memsize);
if (!map->members)
return false;
- if (set->dsize) {
- map->extensions = ip_set_alloc(set->dsize * map->elements);
- if (!map->extensions) {
- kfree(map->members);
- return false;
- }
- }
map->first_port = first_port;
map->last_port = last_port;
set->timeout = IPSET_NO_TIMEOUT;
@@ -232,6 +226,7 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
{
struct bitmap_port *map;
u16 first_port, last_port;
+ u32 elements;
if (unlikely(!ip_set_attr_netorder(tb, IPSET_ATTR_PORT) ||
!ip_set_attr_netorder(tb, IPSET_ATTR_PORT_TO) ||
@@ -248,14 +243,15 @@ bitmap_port_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
last_port = tmp;
}
- map = kzalloc(sizeof(*map), GFP_KERNEL);
+ elements = last_port - first_port + 1;
+ set->dsize = ip_set_elem_len(set, tb, 0, 0);
+ map = ip_set_alloc(sizeof(*map) + elements * set->dsize);
if (!map)
return -ENOMEM;
- map->elements = last_port - first_port + 1;
+ map->elements = elements;
map->memsize = bitmap_bytes(0, map->elements);
set->variant = &bitmap_port;
- set->dsize = ip_set_elem_len(set, tb, 0);
if (!init_map_port(set, map, first_port, last_port)) {
kfree(map);
return -ENOMEM;
diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c
index 69ab9c2634e1..a748b0c2c981 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -364,25 +364,27 @@ add_extension(enum ip_set_ext_id id, u32 flags, struct nlattr *tb[])
}
size_t
-ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len)
+ip_set_elem_len(struct ip_set *set, struct nlattr *tb[], size_t len,
+ size_t align)
{
enum ip_set_ext_id id;
- size_t offset = len;
u32 cadt_flags = 0;
if (tb[IPSET_ATTR_CADT_FLAGS])
cadt_flags = ip_set_get_h32(tb[IPSET_ATTR_CADT_FLAGS]);
if (cadt_flags & IPSET_FLAG_WITH_FORCEADD)
set->flags |= IPSET_CREATE_FLAG_FORCEADD;
+ if (!align)
+ align = 1;
for (id = 0; id < IPSET_EXT_ID_MAX; id++) {
if (!add_extension(id, cadt_flags, tb))
continue;
- offset = ALIGN(offset, ip_set_extensions[id].align);
- set->offset[id] = offset;
+ len = ALIGN(len, ip_set_extensions[id].align);
+ set->offset[id] = len;
set->extensions |= ip_set_extensions[id].type;
- offset += ip_set_extensions[id].len;
+ len += ip_set_extensions[id].len;
}
- return offset;
+ return ALIGN(len, align);
}
EXPORT_SYMBOL_GPL(ip_set_elem_len);
@@ -495,6 +497,26 @@ __ip_set_put(struct ip_set *set)
write_unlock_bh(&ip_set_ref_lock);
}
+/* set->ref can be swapped out by ip_set_swap, netlink events (like dump) need
+ * a separate reference counter
+ */
+static inline void
+__ip_set_get_netlink(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ set->ref_netlink++;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
+static inline void
+__ip_set_put_netlink(struct ip_set *set)
+{
+ write_lock_bh(&ip_set_ref_lock);
+ BUG_ON(set->ref_netlink == 0);
+ set->ref_netlink--;
+ write_unlock_bh(&ip_set_ref_lock);
+}
+
/* Add, del and test set entries from kernel.
*
* The set behind the index must exist and must be referenced
@@ -823,20 +845,17 @@ find_free_id(struct ip_set_net *inst, const char *name, ip_set_id_t *index,
return 0;
}
-static int
-ip_set_none(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_none(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
return -EOPNOTSUPP;
}
-static int
-ip_set_create(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_create(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct net *net = sock_net(ctnl);
struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set, *clash = NULL;
ip_set_id_t index = IPSET_INVALID_ID;
@@ -974,12 +993,11 @@ ip_set_destroy_set(struct ip_set *set)
kfree(set);
}
-static int
-ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_destroy(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *s;
ip_set_id_t i;
int ret = 0;
@@ -987,6 +1005,9 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
if (unlikely(protocol_failed(attr)))
return -IPSET_ERR_PROTOCOL;
+ /* Must wait for flush to be really finished in list:set */
+ rcu_barrier();
+
/* Commands are serialized and references are
* protected by the ip_set_ref_lock.
* External systems (i.e. xt_set) must call
@@ -1001,7 +1022,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
if (!attr[IPSET_ATTR_SETNAME]) {
for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i);
- if (s && s->ref) {
+ if (s && (s->ref || s->ref_netlink)) {
ret = -IPSET_ERR_BUSY;
goto out;
}
@@ -1023,7 +1044,7 @@ ip_set_destroy(struct sock *ctnl, struct sk_buff *skb,
if (!s) {
ret = -ENOENT;
goto out;
- } else if (s->ref) {
+ } else if (s->ref || s->ref_netlink) {
ret = -IPSET_ERR_BUSY;
goto out;
}
@@ -1050,12 +1071,11 @@ ip_set_flush_set(struct ip_set *set)
spin_unlock_bh(&set->lock);
}
-static int
-ip_set_flush(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_flush(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *s;
ip_set_id_t i;
@@ -1090,12 +1110,11 @@ ip_set_setname2_policy[IPSET_ATTR_CMD_MAX + 1] = {
.len = IPSET_MAXNAMELEN - 1 },
};
-static int
-ip_set_rename(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_rename(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set, *s;
const char *name2;
ip_set_id_t i;
@@ -1140,12 +1159,11 @@ out:
* so the ip_set_list always contains valid pointers to the sets.
*/
-static int
-ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_swap(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *from, *to;
ip_set_id_t from_id, to_id;
char from_name[IPSET_MAXNAMELEN];
@@ -1173,6 +1191,9 @@ ip_set_swap(struct sock *ctnl, struct sk_buff *skb,
from->family == to->family))
return -IPSET_ERR_TYPE_MISMATCH;
+ if (from->ref_netlink || to->ref_netlink)
+ return -EBUSY;
+
strncpy(from_name, from->name, IPSET_MAXNAMELEN);
strncpy(from->name, to->name, IPSET_MAXNAMELEN);
strncpy(to->name, from_name, IPSET_MAXNAMELEN);
@@ -1208,7 +1229,7 @@ ip_set_dump_done(struct netlink_callback *cb)
if (set->variant->uref)
set->variant->uref(set, cb, false);
pr_debug("release set %s\n", set->name);
- __ip_set_put_byindex(inst, index);
+ __ip_set_put_netlink(set);
}
return 0;
}
@@ -1330,7 +1351,7 @@ dump_last:
if (!cb->args[IPSET_CB_ARG0]) {
/* Start listing: make sure set won't be destroyed */
pr_debug("reference set\n");
- set->ref++;
+ set->ref_netlink++;
}
write_unlock_bh(&ip_set_ref_lock);
nlh = start_msg(skb, NETLINK_CB(cb->skb).portid,
@@ -1398,7 +1419,7 @@ release_refcount:
if (set->variant->uref)
set->variant->uref(set, cb, false);
pr_debug("release set %s\n", set->name);
- __ip_set_put_byindex(inst, index);
+ __ip_set_put_netlink(set);
cb->args[IPSET_CB_ARG0] = 0;
}
out:
@@ -1411,10 +1432,9 @@ out:
return ret < 0 ? ret : skb->len;
}
-static int
-ip_set_dump(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_dump(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
if (unlikely(protocol_failed(attr)))
return -IPSET_ERR_PROTOCOL;
@@ -1498,12 +1518,11 @@ call_ad(struct sock *ctnl, struct sk_buff *skb, struct ip_set *set,
return ret;
}
-static int
-ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_uadd(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set;
struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
const struct nlattr *nla;
@@ -1553,12 +1572,11 @@ ip_set_uadd(struct sock *ctnl, struct sk_buff *skb,
return ret;
}
-static int
-ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_udel(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set;
struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
const struct nlattr *nla;
@@ -1608,12 +1626,11 @@ ip_set_udel(struct sock *ctnl, struct sk_buff *skb,
return ret;
}
-static int
-ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_utest(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set_net *inst = ip_set_pernet(net);
struct ip_set *set;
struct nlattr *tb[IPSET_ATTR_ADT_MAX + 1] = {};
int ret = 0;
@@ -1644,12 +1661,11 @@ ip_set_utest(struct sock *ctnl, struct sk_buff *skb,
/* Get headed data of a set */
-static int
-ip_set_header(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_header(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
- struct ip_set_net *inst = ip_set_pernet(sock_net(ctnl));
+ struct ip_set_net *inst = ip_set_pernet(net);
const struct ip_set *set;
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
@@ -1701,10 +1717,9 @@ static const struct nla_policy ip_set_type_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_FAMILY] = { .type = NLA_U8 },
};
-static int
-ip_set_type(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_type(struct net *net, struct sock *ctnl, struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
@@ -1760,10 +1775,9 @@ ip_set_protocol_policy[IPSET_ATTR_CMD_MAX + 1] = {
[IPSET_ATTR_PROTOCOL] = { .type = NLA_U8 },
};
-static int
-ip_set_protocol(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const attr[])
+static int ip_set_protocol(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const attr[])
{
struct sk_buff *skb2;
struct nlmsghdr *nlh2;
diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h
index 691b54fcaf2a..d32fd6b036bf 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -72,8 +72,9 @@ struct hbucket {
DECLARE_BITMAP(used, AHASH_MAX_TUNED);
u8 size; /* size of the array */
u8 pos; /* position of the first free entry */
- unsigned char value[0]; /* the array of the values */
-} __attribute__ ((aligned));
+ unsigned char value[0] /* the array of the values */
+ __aligned(__alignof__(u64));
+};
/* The hash table: the table size stored here in order to make resizing easy */
struct htable {
@@ -475,7 +476,7 @@ static void
mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
{
struct htable *t;
- struct hbucket *n;
+ struct hbucket *n, *tmp;
struct mtype_elem *data;
u32 i, j, d;
#ifdef IP_SET_HASH_WITH_NETS
@@ -510,9 +511,14 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
}
}
if (d >= AHASH_INIT_SIZE) {
- struct hbucket *tmp = kzalloc(sizeof(*tmp) +
- (n->size - AHASH_INIT_SIZE) * dsize,
- GFP_ATOMIC);
+ if (d >= n->size) {
+ rcu_assign_pointer(hbucket(t, i), NULL);
+ kfree_rcu(n, rcu);
+ continue;
+ }
+ tmp = kzalloc(sizeof(*tmp) +
+ (n->size - AHASH_INIT_SIZE) * dsize,
+ GFP_ATOMIC);
if (!tmp)
/* Still try to delete expired elements */
continue;
@@ -522,7 +528,7 @@ mtype_expire(struct ip_set *set, struct htype *h, u8 nets_length, size_t dsize)
continue;
data = ahash_data(n, j, dsize);
memcpy(tmp->value + d * dsize, data, dsize);
- set_bit(j, tmp->used);
+ set_bit(d, tmp->used);
d++;
}
tmp->pos = d;
@@ -1076,7 +1082,7 @@ mtype_head(struct ip_set *set, struct sk_buff *skb)
if (nla_put_u32(skb, IPSET_ATTR_MARKMASK, h->markmask))
goto nla_put_failure;
#endif
- if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ if (nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE, htonl(memsize)))
goto nla_put_failure;
if (unlikely(ip_set_put_flags(skb, set)))
@@ -1323,12 +1329,14 @@ IPSET_TOKEN(HTYPE, _create)(struct net *net, struct ip_set *set,
#endif
set->variant = &IPSET_TOKEN(HTYPE, 4_variant);
set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)));
+ sizeof(struct IPSET_TOKEN(HTYPE, 4_elem)),
+ __alignof__(struct IPSET_TOKEN(HTYPE, 4_elem)));
#ifndef IP_SET_PROTO_UNDEF
} else {
set->variant = &IPSET_TOKEN(HTYPE, 6_variant);
set->dsize = ip_set_elem_len(set, tb,
- sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)));
+ sizeof(struct IPSET_TOKEN(HTYPE, 6_elem)),
+ __alignof__(struct IPSET_TOKEN(HTYPE, 6_elem)));
}
#endif
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipset/ip_set_hash_mac.c b/net/netfilter/ipset/ip_set_hash_mac.c
index f1e7d2c0f685..8f004edad396 100644
--- a/net/netfilter/ipset/ip_set_hash_mac.c
+++ b/net/netfilter/ipset/ip_set_hash_mac.c
@@ -110,7 +110,8 @@ hash_mac4_uadt(struct ip_set *set, struct nlattr *tb[],
if (tb[IPSET_ATTR_LINENO])
*lineno = nla_get_u32(tb[IPSET_ATTR_LINENO]);
- if (unlikely(!tb[IPSET_ATTR_ETHER]))
+ if (unlikely(!tb[IPSET_ATTR_ETHER] ||
+ nla_len(tb[IPSET_ATTR_ETHER]) != ETH_ALEN))
return -IPSET_ERR_PROTOCOL;
ret = ip_set_get_extensions(set, tb, &ext);
diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c b/net/netfilter/ipset/ip_set_hash_netiface.c
index 43d8c9896fa3..f0f688db6213 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -164,8 +164,6 @@ hash_netiface4_kadt(struct ip_set *set, const struct sk_buff *skb,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- if (e.cidr == 0)
- return -EINVAL;
if (adt == IPSET_TEST)
e.cidr = HOST_MASK;
@@ -377,8 +375,6 @@ hash_netiface6_kadt(struct ip_set *set, const struct sk_buff *skb,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
- if (e.cidr == 0)
- return -EINVAL;
if (adt == IPSET_TEST)
e.cidr = HOST_MASK;
diff --git a/net/netfilter/ipset/ip_set_list_set.c b/net/netfilter/ipset/ip_set_list_set.c
index 5a30ce6e8c90..a2a89e4e0a14 100644
--- a/net/netfilter/ipset/ip_set_list_set.c
+++ b/net/netfilter/ipset/ip_set_list_set.c
@@ -30,8 +30,9 @@ MODULE_ALIAS("ip_set_list:set");
struct set_elem {
struct rcu_head rcu;
struct list_head list;
+ struct ip_set *set; /* Sigh, in order to cleanup reference */
ip_set_id_t id;
-};
+} __aligned(__alignof__(u64));
struct set_adt_elem {
ip_set_id_t id;
@@ -151,30 +152,29 @@ list_set_kadt(struct ip_set *set, const struct sk_buff *skb,
/* Userspace interfaces: we are protected by the nfnl mutex */
static void
-__list_set_del(struct ip_set *set, struct set_elem *e)
+__list_set_del_rcu(struct rcu_head * rcu)
{
+ struct set_elem *e = container_of(rcu, struct set_elem, rcu);
+ struct ip_set *set = e->set;
struct list_set *map = set->data;
ip_set_put_byindex(map->net, e->id);
- /* We may call it, because we don't have a to be destroyed
- * extension which is used by the kernel.
- */
ip_set_ext_destroy(set, e);
- kfree_rcu(e, rcu);
+ kfree(e);
}
static inline void
list_set_del(struct ip_set *set, struct set_elem *e)
{
list_del_rcu(&e->list);
- __list_set_del(set, e);
+ call_rcu(&e->rcu, __list_set_del_rcu);
}
static inline void
-list_set_replace(struct ip_set *set, struct set_elem *e, struct set_elem *old)
+list_set_replace(struct set_elem *e, struct set_elem *old)
{
list_replace_rcu(&old->list, &e->list);
- __list_set_del(set, old);
+ call_rcu(&old->rcu, __list_set_del_rcu);
}
static void
@@ -244,9 +244,6 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct set_elem *e, *n, *prev, *next;
bool flag_exist = flags & IPSET_FLAG_EXIST;
- if (SET_WITH_TIMEOUT(set))
- set_cleanup_entries(set);
-
/* Find where to add the new entry */
n = prev = next = NULL;
list_for_each_entry(e, &map->members, list) {
@@ -301,10 +298,11 @@ list_set_uadd(struct ip_set *set, void *value, const struct ip_set_ext *ext,
if (!e)
return -ENOMEM;
e->id = d->id;
+ e->set = set;
INIT_LIST_HEAD(&e->list);
list_set_init_extensions(set, ext, e);
if (n)
- list_set_replace(set, e, n);
+ list_set_replace(e, n);
else if (next)
list_add_tail_rcu(&e->list, &next->list);
else if (prev)
@@ -431,6 +429,7 @@ list_set_destroy(struct ip_set *set)
if (SET_WITH_TIMEOUT(set))
del_timer_sync(&map->gc);
+
list_for_each_entry_safe(e, n, &map->members, list) {
list_del(&e->list);
ip_set_put_byindex(map->net, e->id);
@@ -450,14 +449,16 @@ list_set_head(struct ip_set *set, struct sk_buff *skb)
struct set_elem *e;
u32 n = 0;
- list_for_each_entry(e, &map->members, list)
+ rcu_read_lock();
+ list_for_each_entry_rcu(e, &map->members, list)
n++;
+ rcu_read_unlock();
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
if (!nested)
goto nla_put_failure;
if (nla_put_net32(skb, IPSET_ATTR_SIZE, htonl(map->size)) ||
- nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref - 1)) ||
+ nla_put_net32(skb, IPSET_ATTR_REFERENCES, htonl(set->ref)) ||
nla_put_net32(skb, IPSET_ATTR_MEMSIZE,
htonl(sizeof(*map) + n * set->dsize)))
goto nla_put_failure;
@@ -483,33 +484,25 @@ list_set_list(const struct ip_set *set,
atd = ipset_nest_start(skb, IPSET_ATTR_ADT);
if (!atd)
return -EMSGSIZE;
- list_for_each_entry(e, &map->members, list) {
- if (i == first)
- break;
- i++;
- }
rcu_read_lock();
- list_for_each_entry_from(e, &map->members, list) {
- i++;
- if (SET_WITH_TIMEOUT(set) &&
- ip_set_timeout_expired(ext_timeout(e, set)))
+ list_for_each_entry_rcu(e, &map->members, list) {
+ if (i < first ||
+ (SET_WITH_TIMEOUT(set) &&
+ ip_set_timeout_expired(ext_timeout(e, set)))) {
+ i++;
continue;
+ }
nested = ipset_nest_start(skb, IPSET_ATTR_DATA);
- if (!nested) {
- if (i == first) {
- nla_nest_cancel(skb, atd);
- ret = -EMSGSIZE;
- goto out;
- }
+ if (!nested)
goto nla_put_failure;
- }
if (nla_put_string(skb, IPSET_ATTR_NAME,
ip_set_name_byindex(map->net, e->id)))
goto nla_put_failure;
if (ip_set_put_extensions(skb, set, e, true))
goto nla_put_failure;
ipset_nest_end(skb, nested);
+ i++;
}
ipset_nest_end(skb, atd);
@@ -520,10 +513,12 @@ list_set_list(const struct ip_set *set,
nla_put_failure:
nla_nest_cancel(skb, nested);
if (unlikely(i == first)) {
+ nla_nest_cancel(skb, atd);
cb->args[IPSET_CB_ARG0] = 0;
ret = -EMSGSIZE;
+ } else {
+ cb->args[IPSET_CB_ARG0] = i;
}
- cb->args[IPSET_CB_ARG0] = i - 1;
ipset_nest_end(skb, atd);
out:
rcu_read_unlock();
@@ -618,7 +613,8 @@ list_set_create(struct net *net, struct ip_set *set, struct nlattr *tb[],
size = IP_SET_LIST_MIN_SIZE;
set->variant = &set_variant;
- set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem));
+ set->dsize = ip_set_elem_len(set, tb, sizeof(struct set_elem),
+ __alignof__(struct set_elem));
if (!init_list_set(net, set, size))
return -ENOMEM;
if (tb[IPSET_ATTR_TIMEOUT]) {
diff --git a/net/netfilter/ipvs/ip_vs_app.c b/net/netfilter/ipvs/ip_vs_app.c
index 0328f7250693..299edc6add5a 100644
--- a/net/netfilter/ipvs/ip_vs_app.c
+++ b/net/netfilter/ipvs/ip_vs_app.c
@@ -605,17 +605,13 @@ static const struct file_operations ip_vs_app_fops = {
int __net_init ip_vs_app_net_init(struct netns_ipvs *ipvs)
{
- struct net *net = ipvs->net;
-
INIT_LIST_HEAD(&ipvs->app_list);
- proc_create("ip_vs_app", 0, net->proc_net, &ip_vs_app_fops);
+ proc_create("ip_vs_app", 0, ipvs->net->proc_net, &ip_vs_app_fops);
return 0;
}
void __net_exit ip_vs_app_net_cleanup(struct netns_ipvs *ipvs)
{
- struct net *net = ipvs->net;
-
unregister_ip_vs_app(ipvs, NULL /* all */);
- remove_proc_entry("ip_vs_app", net->proc_net);
+ remove_proc_entry("ip_vs_app", ipvs->net->proc_net);
}
diff --git a/net/netfilter/ipvs/ip_vs_core.c b/net/netfilter/ipvs/ip_vs_core.c
index 1e24fff53e4b..b9a4082afa3a 100644
--- a/net/netfilter/ipvs/ip_vs_core.c
+++ b/net/netfilter/ipvs/ip_vs_core.c
@@ -1089,6 +1089,7 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
switch (cp->protocol) {
case IPPROTO_TCP:
return (cp->state == IP_VS_TCP_S_TIME_WAIT) ||
+ (cp->state == IP_VS_TCP_S_CLOSE) ||
((conn_reuse_mode & 2) &&
(cp->state == IP_VS_TCP_S_FIN_WAIT) &&
(cp->flags & IP_VS_CONN_F_NOOUTPUT));
@@ -1176,6 +1177,7 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
struct ip_vs_protocol *pp;
struct ip_vs_proto_data *pd;
struct ip_vs_conn *cp;
+ struct sock *sk;
EnterFunction(11);
@@ -1183,13 +1185,12 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
if (skb->ipvs_property)
return NF_ACCEPT;
+ sk = skb_to_full_sk(skb);
/* Bad... Do not break raw sockets */
- if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
return NF_ACCEPT;
}
@@ -1681,6 +1682,7 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
struct ip_vs_conn *cp;
int ret, pkts;
int conn_reuse_mode;
+ struct sock *sk;
/* Already marked as IPVS request or reply? */
if (skb->ipvs_property)
@@ -1708,12 +1710,11 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
ip_vs_fill_iph_skb(af, skb, false, &iph);
/* Bad... Do not break raw sockets */
- if (unlikely(skb->sk != NULL && hooknum == NF_INET_LOCAL_OUT &&
+ sk = skb_to_full_sk(skb);
+ if (unlikely(sk && hooknum == NF_INET_LOCAL_OUT &&
af == AF_INET)) {
- struct sock *sk = skb->sk;
- struct inet_sock *inet = inet_sk(skb->sk);
- if (inet && sk->sk_family == PF_INET && inet->nodefrag)
+ if (sk->sk_family == PF_INET && inet_sk(sk)->nodefrag)
return NF_ACCEPT;
}
@@ -1757,15 +1758,34 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
cp = pp->conn_in_get(ipvs, af, skb, &iph);
conn_reuse_mode = sysctl_conn_reuse_mode(ipvs);
- if (conn_reuse_mode && !iph.fragoffs &&
- is_new_conn(skb, &iph) && cp &&
- ((unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
- unlikely(!atomic_read(&cp->dest->weight))) ||
- unlikely(is_new_conn_expected(cp, conn_reuse_mode)))) {
- if (!atomic_read(&cp->n_control))
- ip_vs_conn_expire_now(cp);
- __ip_vs_conn_put(cp);
- cp = NULL;
+ if (conn_reuse_mode && !iph.fragoffs && is_new_conn(skb, &iph) && cp) {
+ bool uses_ct = false, resched = false;
+
+ if (unlikely(sysctl_expire_nodest_conn(ipvs)) && cp->dest &&
+ unlikely(!atomic_read(&cp->dest->weight))) {
+ resched = true;
+ uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+ } else if (is_new_conn_expected(cp, conn_reuse_mode)) {
+ uses_ct = ip_vs_conn_uses_conntrack(cp, skb);
+ if (!atomic_read(&cp->n_control)) {
+ resched = true;
+ } else {
+ /* Do not reschedule controlling connection
+ * that uses conntrack while it is still
+ * referenced by controlled connection(s).
+ */
+ resched = !uses_ct;
+ }
+ }
+
+ if (resched) {
+ if (!atomic_read(&cp->n_control))
+ ip_vs_conn_expire_now(cp);
+ __ip_vs_conn_put(cp);
+ if (uses_ct)
+ return NF_DROP;
+ cp = NULL;
+ }
}
if (unlikely(!cp)) {
diff --git a/net/netfilter/ipvs/ip_vs_ctl.c b/net/netfilter/ipvs/ip_vs_ctl.c
index e7c1b052c2a3..404b2a4f4b5b 100644
--- a/net/netfilter/ipvs/ip_vs_ctl.c
+++ b/net/netfilter/ipvs/ip_vs_ctl.c
@@ -1376,8 +1376,6 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
struct ip_vs_pe *old_pe;
struct netns_ipvs *ipvs = svc->ipvs;
- pr_info("%s: enter\n", __func__);
-
/* Count only IPv4 services for old get/setsockopt interface */
if (svc->af == AF_INET)
ipvs->num_services--;
@@ -3947,7 +3945,6 @@ static struct notifier_block ip_vs_dst_notifier = {
int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
{
- struct net *net = ipvs->net;
int i, idx;
/* Initialize rs_table */
@@ -3974,9 +3971,9 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
spin_lock_init(&ipvs->tot_stats.lock);
- proc_create("ip_vs", 0, net->proc_net, &ip_vs_info_fops);
- proc_create("ip_vs_stats", 0, net->proc_net, &ip_vs_stats_fops);
- proc_create("ip_vs_stats_percpu", 0, net->proc_net,
+ proc_create("ip_vs", 0, ipvs->net->proc_net, &ip_vs_info_fops);
+ proc_create("ip_vs_stats", 0, ipvs->net->proc_net, &ip_vs_stats_fops);
+ proc_create("ip_vs_stats_percpu", 0, ipvs->net->proc_net,
&ip_vs_stats_percpu_fops);
if (ip_vs_control_net_init_sysctl(ipvs))
@@ -3991,13 +3988,11 @@ err:
void __net_exit ip_vs_control_net_cleanup(struct netns_ipvs *ipvs)
{
- struct net *net = ipvs->net;
-
ip_vs_trash_cleanup(ipvs);
ip_vs_control_net_cleanup_sysctl(ipvs);
- remove_proc_entry("ip_vs_stats_percpu", net->proc_net);
- remove_proc_entry("ip_vs_stats", net->proc_net);
- remove_proc_entry("ip_vs", net->proc_net);
+ remove_proc_entry("ip_vs_stats_percpu", ipvs->net->proc_net);
+ remove_proc_entry("ip_vs_stats", ipvs->net->proc_net);
+ remove_proc_entry("ip_vs", ipvs->net->proc_net);
free_percpu(ipvs->tot_stats.cpustats);
}
diff --git a/net/netfilter/ipvs/ip_vs_pe_sip.c b/net/netfilter/ipvs/ip_vs_pe_sip.c
index 1b8d594e493a..0a6eb5c0d9e9 100644
--- a/net/netfilter/ipvs/ip_vs_pe_sip.c
+++ b/net/netfilter/ipvs/ip_vs_pe_sip.c
@@ -70,10 +70,10 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
const char *dptr;
int retc;
- ip_vs_fill_iph_skb(p->af, skb, false, &iph);
+ retc = ip_vs_fill_iph_skb(p->af, skb, false, &iph);
/* Only useful with UDP */
- if (iph.protocol != IPPROTO_UDP)
+ if (!retc || iph.protocol != IPPROTO_UDP)
return -EINVAL;
/* todo: IPv6 fragments:
* I think this only should be done for the first fragment. /HS
@@ -88,7 +88,7 @@ ip_vs_sip_fill_param(struct ip_vs_conn_param *p, struct sk_buff *skb)
dptr = skb->data + dataoff;
datalen = skb->len - dataoff;
- if (get_callid(dptr, dataoff, datalen, &matchoff, &matchlen))
+ if (get_callid(dptr, 0, datalen, &matchoff, &matchlen))
return -EINVAL;
/* N.B: pe_data is only set on success,
diff --git a/net/netfilter/ipvs/ip_vs_proto_sctp.c b/net/netfilter/ipvs/ip_vs_proto_sctp.c
index 010ddeec135f..d952d67f904d 100644
--- a/net/netfilter/ipvs/ip_vs_proto_sctp.c
+++ b/net/netfilter/ipvs/ip_vs_proto_sctp.c
@@ -169,7 +169,7 @@ sctp_dnat_handler(struct sk_buff *skb, struct ip_vs_protocol *pp,
/* Only update csum if we really have to */
if (sctph->dest != cp->dport || payload_csum ||
(skb->ip_summed == CHECKSUM_PARTIAL &&
- !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CSUM))) {
+ !(skb_dst(skb)->dev->features & NETIF_F_SCTP_CRC))) {
sctph->dest = cp->dport;
sctp_nat_csum(skb, sctph, sctphoff);
} else if (skb->ip_summed != CHECKSUM_PARTIAL) {
diff --git a/net/netfilter/ipvs/ip_vs_xmit.c b/net/netfilter/ipvs/ip_vs_xmit.c
index 3264cb49b333..dc196a0f501d 100644
--- a/net/netfilter/ipvs/ip_vs_xmit.c
+++ b/net/netfilter/ipvs/ip_vs_xmit.c
@@ -531,8 +531,6 @@ static inline int ip_vs_tunnel_xmit_prepare(struct sk_buff *skb,
if (ret == NF_ACCEPT) {
nf_reset(skb);
skb_forward_csum(skb);
- if (!skb->sk)
- skb_sender_cpu_clear(skb);
}
return ret;
}
@@ -573,8 +571,6 @@ static inline int ip_vs_nat_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
skb_forward_csum(skb);
- if (!skb->sk)
- skb_sender_cpu_clear(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -595,8 +591,6 @@ static inline int ip_vs_send_or_cont(int pf, struct sk_buff *skb,
if (!local) {
ip_vs_drop_early_demux_sk(skb);
skb_forward_csum(skb);
- if (!skb->sk)
- skb_sender_cpu_clear(skb);
NF_HOOK(pf, NF_INET_LOCAL_OUT, cp->ipvs->net, NULL, skb,
NULL, skb_dst(skb)->dev, dst_output);
} else
@@ -1019,8 +1013,7 @@ ip_vs_tunnel_xmit(struct sk_buff *skb, struct ip_vs_conn *cp,
if (IS_ERR(skb))
goto tx_error;
- skb = iptunnel_handle_offloads(
- skb, false, __tun_gso_type_mask(AF_INET, cp->af));
+ skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET, cp->af));
if (IS_ERR(skb))
goto tx_error;
@@ -1112,8 +1105,7 @@ ip_vs_tunnel_xmit_v6(struct sk_buff *skb, struct ip_vs_conn *cp,
if (IS_ERR(skb))
goto tx_error;
- skb = iptunnel_handle_offloads(
- skb, false, __tun_gso_type_mask(AF_INET6, cp->af));
+ skb = iptunnel_handle_offloads(skb, __tun_gso_type_mask(AF_INET6, cp->af));
if (IS_ERR(skb))
goto tx_error;
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3cb3cb831591..afde5f5e728a 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -66,6 +66,20 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
__cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
+static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
+static __read_mostly bool nf_conntrack_locks_all;
+
+void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
+{
+ spin_lock(lock);
+ while (unlikely(nf_conntrack_locks_all)) {
+ spin_unlock(lock);
+ spin_unlock_wait(&nf_conntrack_locks_all_lock);
+ spin_lock(lock);
+ }
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+
static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
{
h1 %= CONNTRACK_LOCKS;
@@ -82,12 +96,12 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
h1 %= CONNTRACK_LOCKS;
h2 %= CONNTRACK_LOCKS;
if (h1 <= h2) {
- spin_lock(&nf_conntrack_locks[h1]);
+ nf_conntrack_lock(&nf_conntrack_locks[h1]);
if (h1 != h2)
spin_lock_nested(&nf_conntrack_locks[h2],
SINGLE_DEPTH_NESTING);
} else {
- spin_lock(&nf_conntrack_locks[h2]);
+ nf_conntrack_lock(&nf_conntrack_locks[h2]);
spin_lock_nested(&nf_conntrack_locks[h1],
SINGLE_DEPTH_NESTING);
}
@@ -102,16 +116,18 @@ static void nf_conntrack_all_lock(void)
{
int i;
- for (i = 0; i < CONNTRACK_LOCKS; i++)
- spin_lock_nested(&nf_conntrack_locks[i], i);
+ spin_lock(&nf_conntrack_locks_all_lock);
+ nf_conntrack_locks_all = true;
+
+ for (i = 0; i < CONNTRACK_LOCKS; i++) {
+ spin_unlock_wait(&nf_conntrack_locks[i]);
+ }
}
static void nf_conntrack_all_unlock(void)
{
- int i;
-
- for (i = 0; i < CONNTRACK_LOCKS; i++)
- spin_unlock(&nf_conntrack_locks[i]);
+ nf_conntrack_locks_all = false;
+ spin_unlock(&nf_conntrack_locks_all_lock);
}
unsigned int nf_conntrack_htable_size __read_mostly;
@@ -757,7 +773,7 @@ restart:
hash = hash_bucket(_hash, net);
for (; i < net->ct.htable_size; i++) {
lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
- spin_lock(lockp);
+ nf_conntrack_lock(lockp);
if (read_seqcount_retry(&net->ct.generation, sequence)) {
spin_unlock(lockp);
goto restart;
@@ -1382,7 +1398,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
for (; *bucket < net->ct.htable_size; (*bucket)++) {
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
- spin_lock(lockp);
+ nf_conntrack_lock(lockp);
if (*bucket < net->ct.htable_size) {
hlist_nulls_for_each_entry(h, n, &net->ct.hash[*bucket], hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
@@ -1394,6 +1410,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
}
spin_unlock(lockp);
local_bh_enable();
+ cond_resched();
}
for_each_possible_cpu(cpu) {
@@ -1406,6 +1423,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
set_bit(IPS_DYING_BIT, &ct->status);
}
spin_unlock_bh(&pcpu->lock);
+ cond_resched();
}
return NULL;
found:
@@ -1422,6 +1440,8 @@ void nf_ct_iterate_cleanup(struct net *net,
struct nf_conn *ct;
unsigned int bucket = 0;
+ might_sleep();
+
while ((ct = get_next_corpse(net, iter, data, &bucket)) != NULL) {
/* Time to push up daises... */
if (del_timer(&ct->timeout))
@@ -1430,6 +1450,7 @@ void nf_ct_iterate_cleanup(struct net *net,
/* ... else the timer will get him soon. */
nf_ct_put(ct);
+ cond_resched();
}
}
EXPORT_SYMBOL_GPL(nf_ct_iterate_cleanup);
diff --git a/net/netfilter/nf_conntrack_expect.c b/net/netfilter/nf_conntrack_expect.c
index acf5c7b3f378..278927ab0948 100644
--- a/net/netfilter/nf_conntrack_expect.c
+++ b/net/netfilter/nf_conntrack_expect.c
@@ -596,11 +596,18 @@ static int exp_proc_init(struct net *net)
{
#ifdef CONFIG_NF_CONNTRACK_PROCFS
struct proc_dir_entry *proc;
+ kuid_t root_uid;
+ kgid_t root_gid;
proc = proc_create("nf_conntrack_expect", 0440, net->proc_net,
&exp_file_ops);
if (!proc)
return -ENOMEM;
+
+ root_uid = make_kuid(net->user_ns, 0);
+ root_gid = make_kgid(net->user_ns, 0);
+ if (uid_valid(root_uid) && gid_valid(root_gid))
+ proc_set_user(proc, root_uid, root_gid);
#endif /* CONFIG_NF_CONNTRACK_PROCFS */
return 0;
}
diff --git a/net/netfilter/nf_conntrack_ftp.c b/net/netfilter/nf_conntrack_ftp.c
index b666959f17c0..883c691ec8d0 100644
--- a/net/netfilter/nf_conntrack_ftp.c
+++ b/net/netfilter/nf_conntrack_ftp.c
@@ -10,6 +10,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/netfilter.h>
@@ -505,11 +507,11 @@ skip_nl_seq:
different IP address. Simply don't record it for
NAT. */
if (cmd.l3num == PF_INET) {
- pr_debug("conntrack_ftp: NOT RECORDING: %pI4 != %pI4\n",
+ pr_debug("NOT RECORDING: %pI4 != %pI4\n",
&cmd.u3.ip,
&ct->tuplehash[dir].tuple.src.u3.ip);
} else {
- pr_debug("conntrack_ftp: NOT RECORDING: %pI6 != %pI6\n",
+ pr_debug("NOT RECORDING: %pI6 != %pI6\n",
cmd.u3.ip6,
ct->tuplehash[dir].tuple.src.u3.ip6);
}
@@ -586,8 +588,7 @@ static void nf_conntrack_ftp_fini(void)
if (ftp[i][j].me == NULL)
continue;
- pr_debug("nf_ct_ftp: unregistering helper for pf: %d "
- "port: %d\n",
+ pr_debug("unregistering helper for pf: %d port: %d\n",
ftp[i][j].tuple.src.l3num, ports[i]);
nf_conntrack_helper_unregister(&ftp[i][j]);
}
@@ -625,14 +626,12 @@ static int __init nf_conntrack_ftp_init(void)
else
sprintf(ftp[i][j].name, "ftp-%d", ports[i]);
- pr_debug("nf_ct_ftp: registering helper for pf: %d "
- "port: %d\n",
+ pr_debug("registering helper for pf: %d port: %d\n",
ftp[i][j].tuple.src.l3num, ports[i]);
ret = nf_conntrack_helper_register(&ftp[i][j]);
if (ret) {
- printk(KERN_ERR "nf_ct_ftp: failed to register"
- " helper for pf: %d port: %d\n",
- ftp[i][j].tuple.src.l3num, ports[i]);
+ pr_err("failed to register helper for pf: %d port: %d\n",
+ ftp[i][j].tuple.src.l3num, ports[i]);
nf_conntrack_ftp_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_helper.c b/net/netfilter/nf_conntrack_helper.c
index bd9d31537905..3b40ec575cd5 100644
--- a/net/netfilter/nf_conntrack_helper.c
+++ b/net/netfilter/nf_conntrack_helper.c
@@ -425,7 +425,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
}
local_bh_disable();
for (i = 0; i < net->ct.htable_size; i++) {
- spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
if (i < net->ct.htable_size) {
hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
unhelp(h, me);
diff --git a/net/netfilter/nf_conntrack_irc.c b/net/netfilter/nf_conntrack_irc.c
index 0fd2976db7ee..8b6da2719600 100644
--- a/net/netfilter/nf_conntrack_irc.c
+++ b/net/netfilter/nf_conntrack_irc.c
@@ -9,6 +9,8 @@
* 2 of the License, or (at your option) any later version.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/skbuff.h>
@@ -237,7 +239,7 @@ static int __init nf_conntrack_irc_init(void)
int i, ret;
if (max_dcc_channels < 1) {
- printk(KERN_ERR "nf_ct_irc: max_dcc_channels must not be zero\n");
+ pr_err("max_dcc_channels must not be zero\n");
return -EINVAL;
}
@@ -267,8 +269,7 @@ static int __init nf_conntrack_irc_init(void)
ret = nf_conntrack_helper_register(&irc[i]);
if (ret) {
- printk(KERN_ERR "nf_ct_irc: failed to register helper "
- "for pf: %u port: %u\n",
+ pr_err("failed to register helper for pf: %u port: %u\n",
irc[i].tuple.src.l3num, ports[i]);
nf_conntrack_irc_fini();
return ret;
diff --git a/net/netfilter/nf_conntrack_netlink.c b/net/netfilter/nf_conntrack_netlink.c
index 9f5272968abb..355e8552fd5b 100644
--- a/net/netfilter/nf_conntrack_netlink.c
+++ b/net/netfilter/nf_conntrack_netlink.c
@@ -840,7 +840,7 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
restart:
lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
- spin_lock(lockp);
+ nf_conntrack_lock(lockp);
if (cb->args[0] >= net->ct.htable_size) {
spin_unlock(lockp);
goto out;
@@ -1113,12 +1113,11 @@ static int ctnetlink_flush_conntrack(struct net *net,
return 0;
}
-static int
-ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
- struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct;
@@ -1168,12 +1167,11 @@ ctnetlink_del_conntrack(struct sock *ctnl, struct sk_buff *skb,
return 0;
}
-static int
-ctnetlink_get_conntrack(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_get_conntrack(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
- struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple_hash *h;
struct nf_conntrack_tuple tuple;
struct nf_conn *ct;
@@ -1330,10 +1328,10 @@ ctnetlink_dump_dying(struct sk_buff *skb, struct netlink_callback *cb)
return ctnetlink_dump_list(skb, cb, true);
}
-static int
-ctnetlink_get_ct_dying(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_get_ct_dying(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
@@ -1352,10 +1350,10 @@ ctnetlink_dump_unconfirmed(struct sk_buff *skb, struct netlink_callback *cb)
return ctnetlink_dump_list(skb, cb, false);
}
-static int
-ctnetlink_get_ct_unconfirmed(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_get_ct_unconfirmed(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
@@ -1865,12 +1863,11 @@ err1:
return ERR_PTR(err);
}
-static int
-ctnetlink_new_conntrack(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_new_conntrack(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
- struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple otuple, rtuple;
struct nf_conntrack_tuple_hash *h = NULL;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -2034,10 +2031,10 @@ ctnetlink_ct_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int
-ctnetlink_stat_ct_cpu(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_stat_ct_cpu(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
@@ -2080,10 +2077,9 @@ nlmsg_failure:
return -1;
}
-static int
-ctnetlink_stat_ct(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_stat_ct(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
struct sk_buff *skb2;
int err;
@@ -2729,12 +2725,12 @@ out:
return skb->len;
}
-static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
+static int ctnetlink_dump_exp_ct(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
int err;
- struct net *net = sock_net(ctnl);
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int8_t u3 = nfmsg->nfgen_family;
struct nf_conntrack_tuple tuple;
@@ -2768,12 +2764,10 @@ static int ctnetlink_dump_exp_ct(struct sock *ctnl, struct sk_buff *skb,
return err;
}
-static int
-ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
- struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
struct sk_buff *skb2;
@@ -2784,7 +2778,7 @@ ctnetlink_get_expect(struct sock *ctnl, struct sk_buff *skb,
if (nlh->nlmsg_flags & NLM_F_DUMP) {
if (cda[CTA_EXPECT_MASTER])
- return ctnetlink_dump_exp_ct(ctnl, skb, nlh, cda);
+ return ctnetlink_dump_exp_ct(net, ctnl, skb, nlh, cda);
else {
struct netlink_dump_control c = {
.dump = ctnetlink_exp_dump_table,
@@ -2850,12 +2844,10 @@ out:
return err == -EAGAIN ? -ENOBUFS : err;
}
-static int
-ctnetlink_del_expect(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
- struct net *net = sock_net(ctnl);
struct nf_conntrack_expect *exp;
struct nf_conntrack_tuple tuple;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -3136,12 +3128,10 @@ err_ct:
return err;
}
-static int
-ctnetlink_new_expect(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_new_expect(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
- struct net *net = sock_net(ctnl);
struct nf_conntrack_tuple tuple;
struct nf_conntrack_expect *exp;
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -3242,10 +3232,10 @@ ctnetlink_exp_stat_cpu_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int
-ctnetlink_stat_exp_cpu(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int ctnetlink_stat_exp_cpu(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
if (nlh->nlmsg_flags & NLM_F_DUMP) {
struct netlink_dump_control c = {
diff --git a/net/netfilter/nf_conntrack_sane.c b/net/netfilter/nf_conntrack_sane.c
index 4a2134fd3fcb..7523a575f6d1 100644
--- a/net/netfilter/nf_conntrack_sane.c
+++ b/net/netfilter/nf_conntrack_sane.c
@@ -17,6 +17,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/netfilter.h>
@@ -120,14 +122,14 @@ static int help(struct sk_buff *skb,
ct_sane_info->state = SANE_STATE_NORMAL;
if (datalen < sizeof(struct sane_reply_net_start)) {
- pr_debug("nf_ct_sane: NET_START reply too short\n");
+ pr_debug("NET_START reply too short\n");
goto out;
}
reply = sb_ptr;
if (reply->status != htonl(SANE_STATUS_SUCCESS)) {
/* saned refused the command */
- pr_debug("nf_ct_sane: unsuccessful SANE_STATUS = %u\n",
+ pr_debug("unsuccessful SANE_STATUS = %u\n",
ntohl(reply->status));
goto out;
}
@@ -148,7 +150,7 @@ static int help(struct sk_buff *skb,
&tuple->src.u3, &tuple->dst.u3,
IPPROTO_TCP, NULL, &reply->port);
- pr_debug("nf_ct_sane: expect: ");
+ pr_debug("expect: ");
nf_ct_dump_tuple(&exp->tuple);
/* Can't expect this? Best to drop packet now. */
@@ -178,8 +180,7 @@ static void nf_conntrack_sane_fini(void)
for (i = 0; i < ports_c; i++) {
for (j = 0; j < 2; j++) {
- pr_debug("nf_ct_sane: unregistering helper for pf: %d "
- "port: %d\n",
+ pr_debug("unregistering helper for pf: %d port: %d\n",
sane[i][j].tuple.src.l3num, ports[i]);
nf_conntrack_helper_unregister(&sane[i][j]);
}
@@ -216,14 +217,12 @@ static int __init nf_conntrack_sane_init(void)
else
sprintf(sane[i][j].name, "sane-%d", ports[i]);
- pr_debug("nf_ct_sane: registering helper for pf: %d "
- "port: %d\n",
+ pr_debug("registering helper for pf: %d port: %d\n",
sane[i][j].tuple.src.l3num, ports[i]);
ret = nf_conntrack_helper_register(&sane[i][j]);
if (ret) {
- printk(KERN_ERR "nf_ct_sane: failed to "
- "register helper for pf: %d port: %d\n",
- sane[i][j].tuple.src.l3num, ports[i]);
+ pr_err("failed to register helper for pf: %d port: %d\n",
+ sane[i][j].tuple.src.l3num, ports[i]);
nf_conntrack_sane_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_sip.c b/net/netfilter/nf_conntrack_sip.c
index 885b4aba3695..3e06402739e0 100644
--- a/net/netfilter/nf_conntrack_sip.c
+++ b/net/netfilter/nf_conntrack_sip.c
@@ -10,6 +10,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/ctype.h>
#include <linux/skbuff.h>
@@ -1665,8 +1667,7 @@ static int __init nf_conntrack_sip_init(void)
ret = nf_conntrack_helper_register(&sip[i][j]);
if (ret) {
- printk(KERN_ERR "nf_ct_sip: failed to register"
- " helper for pf: %u port: %u\n",
+ pr_err("failed to register helper for pf: %u port: %u\n",
sip[i][j].tuple.src.l3num, ports[i]);
nf_conntrack_sip_fini();
return ret;
diff --git a/net/netfilter/nf_conntrack_standalone.c b/net/netfilter/nf_conntrack_standalone.c
index 1fb3cacc04e1..0f1a45bcacb2 100644
--- a/net/netfilter/nf_conntrack_standalone.c
+++ b/net/netfilter/nf_conntrack_standalone.c
@@ -392,11 +392,18 @@ static const struct file_operations ct_cpu_seq_fops = {
static int nf_conntrack_standalone_init_proc(struct net *net)
{
struct proc_dir_entry *pde;
+ kuid_t root_uid;
+ kgid_t root_gid;
pde = proc_create("nf_conntrack", 0440, net->proc_net, &ct_file_ops);
if (!pde)
goto out_nf_conntrack;
+ root_uid = make_kuid(net->user_ns, 0);
+ root_gid = make_kgid(net->user_ns, 0);
+ if (uid_valid(root_uid) && gid_valid(root_gid))
+ proc_set_user(pde, root_uid, root_gid);
+
pde = proc_create("nf_conntrack", S_IRUGO, net->proc_net_stat,
&ct_cpu_seq_fops);
if (!pde)
diff --git a/net/netfilter/nf_conntrack_tftp.c b/net/netfilter/nf_conntrack_tftp.c
index e68ab4fbd71f..36f964066461 100644
--- a/net/netfilter/nf_conntrack_tftp.c
+++ b/net/netfilter/nf_conntrack_tftp.c
@@ -5,6 +5,8 @@
* published by the Free Software Foundation.
*/
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
#include <linux/module.h>
#include <linux/moduleparam.h>
#include <linux/in.h>
@@ -138,9 +140,8 @@ static int __init nf_conntrack_tftp_init(void)
ret = nf_conntrack_helper_register(&tftp[i][j]);
if (ret) {
- printk(KERN_ERR "nf_ct_tftp: failed to register"
- " helper for pf: %u port: %u\n",
- tftp[i][j].tuple.src.l3num, ports[i]);
+ pr_err("failed to register helper for pf: %u port: %u\n",
+ tftp[i][j].tuple.src.l3num, ports[i]);
nf_conntrack_tftp_fini();
return ret;
}
diff --git a/net/netfilter/nf_conntrack_timeout.c b/net/netfilter/nf_conntrack_timeout.c
index 93da609d9d29..26e742006c48 100644
--- a/net/netfilter/nf_conntrack_timeout.c
+++ b/net/netfilter/nf_conntrack_timeout.c
@@ -25,7 +25,7 @@
#include <net/netfilter/nf_conntrack_timeout.h>
struct ctnl_timeout *
-(*nf_ct_timeout_find_get_hook)(const char *name) __read_mostly;
+(*nf_ct_timeout_find_get_hook)(struct net *net, const char *name) __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_timeout_find_get_hook);
void (*nf_ct_timeout_put_hook)(struct ctnl_timeout *timeout) __read_mostly;
diff --git a/net/netfilter/nf_dup_netdev.c b/net/netfilter/nf_dup_netdev.c
new file mode 100644
index 000000000000..7ec69723940f
--- /dev/null
+++ b/net/netfilter/nf_dup_netdev.c
@@ -0,0 +1,39 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+
+void nf_dup_netdev_egress(const struct nft_pktinfo *pkt, int oif)
+{
+ struct net_device *dev;
+ struct sk_buff *skb;
+
+ dev = dev_get_by_index_rcu(pkt->net, oif);
+ if (dev == NULL)
+ return;
+
+ skb = skb_clone(pkt->skb, GFP_ATOMIC);
+ if (skb == NULL)
+ return;
+
+ if (skb_mac_header_was_set(skb))
+ skb_push(skb, skb->mac_len);
+
+ skb->dev = dev;
+ dev_queue_xmit(skb);
+}
+EXPORT_SYMBOL_GPL(nf_dup_netdev_egress);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
index 97b75f9bfbcd..d43869879fcf 100644
--- a/net/netfilter/nf_nat_redirect.c
+++ b/net/netfilter/nf_nat_redirect.c
@@ -55,7 +55,7 @@ nf_nat_redirect_ipv4(struct sk_buff *skb,
rcu_read_lock();
indev = __in_dev_get_rcu(skb->dev);
- if (indev != NULL) {
+ if (indev && indev->ifa_list) {
ifa = indev->ifa_list;
newdst = ifa->ifa_local;
}
diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
index 93cc4737018f..2011977cd79d 100644
--- a/net/netfilter/nf_tables_api.c
+++ b/net/netfilter/nf_tables_api.c
@@ -41,6 +41,8 @@ int nft_register_afinfo(struct net *net, struct nft_af_info *afi)
}
EXPORT_SYMBOL_GPL(nft_register_afinfo);
+static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi);
+
/**
* nft_unregister_afinfo - unregister nf_tables address family info
*
@@ -48,9 +50,10 @@ EXPORT_SYMBOL_GPL(nft_register_afinfo);
*
* Unregister the address family for use with nf_tables.
*/
-void nft_unregister_afinfo(struct nft_af_info *afi)
+void nft_unregister_afinfo(struct net *net, struct nft_af_info *afi)
{
nfnl_lock(NFNL_SUBSYS_NFTABLES);
+ __nft_release_afinfo(net, afi);
list_del_rcu(&afi->list);
nfnl_unlock(NFNL_SUBSYS_NFTABLES);
}
@@ -89,6 +92,7 @@ nf_tables_afinfo_lookup(struct net *net, int family, bool autoload)
}
static void nft_ctx_init(struct nft_ctx *ctx,
+ struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
struct nft_af_info *afi,
@@ -96,7 +100,7 @@ static void nft_ctx_init(struct nft_ctx *ctx,
struct nft_chain *chain,
const struct nlattr * const *nla)
{
- ctx->net = sock_net(skb->sk);
+ ctx->net = net;
ctx->afi = afi;
ctx->table = table;
ctx->chain = chain;
@@ -127,8 +131,8 @@ static void nft_trans_destroy(struct nft_trans *trans)
kfree(trans);
}
-int nft_register_basechain(struct nft_base_chain *basechain,
- unsigned int hook_nops)
+static int nft_register_basechain(struct nft_base_chain *basechain,
+ unsigned int hook_nops)
{
struct net *net = read_pnet(&basechain->pnet);
@@ -137,10 +141,9 @@ int nft_register_basechain(struct nft_base_chain *basechain,
return nf_register_net_hooks(net, basechain->ops, hook_nops);
}
-EXPORT_SYMBOL_GPL(nft_register_basechain);
-void nft_unregister_basechain(struct nft_base_chain *basechain,
- unsigned int hook_nops)
+static void nft_unregister_basechain(struct nft_base_chain *basechain,
+ unsigned int hook_nops)
{
struct net *net = read_pnet(&basechain->pnet);
@@ -149,7 +152,6 @@ void nft_unregister_basechain(struct nft_base_chain *basechain,
nf_unregister_net_hooks(net, basechain->ops, hook_nops);
}
-EXPORT_SYMBOL_GPL(nft_unregister_basechain);
static int nf_tables_register_hooks(const struct nft_table *table,
struct nft_chain *chain,
@@ -541,15 +543,14 @@ done:
return skb->len;
}
-static int nf_tables_gettable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_gettable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nft_af_info *afi;
const struct nft_table *table;
struct sk_buff *skb2;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
int err;
@@ -672,15 +673,14 @@ err:
return ret;
}
-static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newtable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nlattr *name;
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
u32 flags = 0;
struct nft_ctx ctx;
@@ -706,7 +706,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
if (nlh->nlmsg_flags & NLM_F_REPLACE)
return -EOPNOTSUPP;
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
return nf_tables_updtable(&ctx);
}
@@ -730,7 +730,7 @@ static int nf_tables_newtable(struct sock *nlsk, struct sk_buff *skb,
INIT_LIST_HEAD(&table->sets);
table->flags = flags;
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
err = nft_trans_table_add(&ctx, NFT_MSG_NEWTABLE);
if (err < 0)
goto err3;
@@ -810,18 +810,17 @@ out:
return err;
}
-static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_deltable(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
- nft_ctx_init(&ctx, skb, nlh, NULL, NULL, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, NULL, NULL, NULL, nla);
if (family == AF_UNSPEC || nla[NFTA_TABLE_NAME] == NULL)
return nft_flush(&ctx, family);
@@ -832,8 +831,6 @@ static int nf_tables_deltable(struct sock *nlsk, struct sk_buff *skb,
table = nf_tables_table_lookup(afi, nla[NFTA_TABLE_NAME]);
if (IS_ERR(table))
return PTR_ERR(table);
- if (table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
ctx.afi = afi;
ctx.table = table;
@@ -1099,8 +1096,8 @@ done:
return skb->len;
}
-static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_getchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -1108,7 +1105,6 @@ static int nf_tables_getchain(struct sock *nlsk, struct sk_buff *skb,
const struct nft_table *table;
const struct nft_chain *chain;
struct sk_buff *skb2;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
int err;
@@ -1221,8 +1217,8 @@ static void nf_tables_chain_destroy(struct nft_chain *chain)
}
}
-static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -1232,7 +1228,6 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
struct nft_chain *chain;
struct nft_base_chain *basechain = NULL;
struct nlattr *ha[NFTA_HOOK_MAX + 1];
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct net_device *dev = NULL;
u8 policy = NF_ACCEPT;
@@ -1313,7 +1308,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(stats);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
trans = nft_trans_alloc(&ctx, NFT_MSG_NEWCHAIN,
sizeof(struct nft_trans_chain));
if (trans == NULL) {
@@ -1461,7 +1456,7 @@ static int nf_tables_newchain(struct sock *nlsk, struct sk_buff *skb,
if (err < 0)
goto err1;
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
err = nft_trans_chain_add(&ctx, NFT_MSG_NEWCHAIN);
if (err < 0)
goto err2;
@@ -1476,15 +1471,14 @@ err1:
return err;
}
-static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delchain(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
struct nft_chain *chain;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
struct nft_ctx ctx;
@@ -1495,18 +1489,14 @@ static int nf_tables_delchain(struct sock *nlsk, struct sk_buff *skb,
table = nf_tables_table_lookup(afi, nla[NFTA_CHAIN_TABLE]);
if (IS_ERR(table))
return PTR_ERR(table);
- if (table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
chain = nf_tables_chain_lookup(table, nla[NFTA_CHAIN_NAME]);
if (IS_ERR(chain))
return PTR_ERR(chain);
- if (chain->flags & NFT_CHAIN_INACTIVE)
- return -ENOENT;
if (chain->use > 0)
return -EBUSY;
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
return nft_delchain(&ctx);
}
@@ -1931,8 +1921,8 @@ done:
return skb->len;
}
-static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_getrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -1941,7 +1931,6 @@ static int nf_tables_getrule(struct sock *nlsk, struct sk_buff *skb,
const struct nft_chain *chain;
const struct nft_rule *rule;
struct sk_buff *skb2;
- struct net *net = sock_net(skb->sk);
int family = nfmsg->nfgen_family;
int err;
@@ -2010,13 +1999,12 @@ static void nf_tables_rule_destroy(const struct nft_ctx *ctx,
static struct nft_expr_info *info;
-static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_chain *chain;
struct nft_rule *rule, *old_rule = NULL;
@@ -2075,7 +2063,7 @@ static int nf_tables_newrule(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(old_rule);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
n = 0;
size = 0;
@@ -2176,13 +2164,12 @@ err1:
return err;
}
-static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delrule(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_chain *chain = NULL;
struct nft_rule *rule;
@@ -2196,8 +2183,6 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
table = nf_tables_table_lookup(afi, nla[NFTA_RULE_TABLE]);
if (IS_ERR(table))
return PTR_ERR(table);
- if (table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
if (nla[NFTA_RULE_CHAIN]) {
chain = nf_tables_chain_lookup(table, nla[NFTA_RULE_CHAIN]);
@@ -2205,7 +2190,7 @@ static int nf_tables_delrule(struct sock *nlsk, struct sk_buff *skb,
return PTR_ERR(chain);
}
- nft_ctx_init(&ctx, skb, nlh, afi, table, chain, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, chain, nla);
if (chain) {
if (nla[NFTA_RULE_HANDLE]) {
@@ -2338,18 +2323,19 @@ static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_ID] = { .type = NLA_U32 },
[NFTA_SET_TIMEOUT] = { .type = NLA_U64 },
[NFTA_SET_GC_INTERVAL] = { .type = NLA_U32 },
+ [NFTA_SET_USERDATA] = { .type = NLA_BINARY,
+ .len = NFT_USERDATA_MAXLEN },
};
static const struct nla_policy nft_set_desc_policy[NFTA_SET_DESC_MAX + 1] = {
[NFTA_SET_DESC_SIZE] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
+static int nft_ctx_init_from_setattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi = NULL;
struct nft_table *table = NULL;
@@ -2367,11 +2353,9 @@ static int nft_ctx_init_from_setattr(struct nft_ctx *ctx,
table = nf_tables_table_lookup(afi, nla[NFTA_SET_TABLE]);
if (IS_ERR(table))
return PTR_ERR(table);
- if (table->flags & NFT_TABLE_INACTIVE)
- return -ENOENT;
}
- nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
return 0;
}
@@ -2500,6 +2484,9 @@ static int nf_tables_fill_set(struct sk_buff *skb, const struct nft_ctx *ctx,
goto nla_put_failure;
}
+ if (nla_put(skb, NFTA_SET_USERDATA, set->udlen, set->udata))
+ goto nla_put_failure;
+
desc = nla_nest_start(skb, NFTA_SET_DESC);
if (desc == NULL)
goto nla_put_failure;
@@ -2619,8 +2606,8 @@ static int nf_tables_dump_sets_done(struct netlink_callback *cb)
return 0;
}
-static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_getset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nft_set *set;
@@ -2630,7 +2617,7 @@ static int nf_tables_getset(struct sock *nlsk, struct sk_buff *skb,
int err;
/* Verify existence before starting dump */
- err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
@@ -2693,14 +2680,13 @@ static int nf_tables_set_desc_parse(const struct nft_ctx *ctx,
return 0;
}
-static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
const struct nft_set_ops *ops;
struct nft_af_info *afi;
- struct net *net = sock_net(skb->sk);
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
@@ -2710,6 +2696,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
u64 timeout;
u32 ktype, dtype, flags, policy, gc_int;
struct nft_set_desc desc;
+ unsigned char *udata;
+ u16 udlen;
int err;
if (nla[NFTA_SET_TABLE] == NULL ||
@@ -2798,7 +2786,7 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
if (IS_ERR(table))
return PTR_ERR(table);
- nft_ctx_init(&ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(&ctx, net, skb, nlh, afi, table, NULL, nla);
set = nf_tables_set_lookup(table, nla[NFTA_SET_NAME]);
if (IS_ERR(set)) {
@@ -2822,12 +2810,16 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
if (IS_ERR(ops))
return PTR_ERR(ops);
+ udlen = 0;
+ if (nla[NFTA_SET_USERDATA])
+ udlen = nla_len(nla[NFTA_SET_USERDATA]);
+
size = 0;
if (ops->privsize != NULL)
size = ops->privsize(nla);
err = -ENOMEM;
- set = kzalloc(sizeof(*set) + size, GFP_KERNEL);
+ set = kzalloc(sizeof(*set) + size + udlen, GFP_KERNEL);
if (set == NULL)
goto err1;
@@ -2836,6 +2828,12 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
if (err < 0)
goto err2;
+ udata = NULL;
+ if (udlen) {
+ udata = set->data + size;
+ nla_memcpy(udata, nla[NFTA_SET_USERDATA], udlen);
+ }
+
INIT_LIST_HEAD(&set->bindings);
write_pnet(&set->pnet, net);
set->ops = ops;
@@ -2846,6 +2844,8 @@ static int nf_tables_newset(struct sock *nlsk, struct sk_buff *skb,
set->flags = flags;
set->size = desc.size;
set->policy = policy;
+ set->udlen = udlen;
+ set->udata = udata;
set->timeout = timeout;
set->gc_int = gc_int;
@@ -2882,8 +2882,8 @@ static void nf_tables_set_destroy(const struct nft_ctx *ctx, struct nft_set *set
nft_set_destroy(set);
}
-static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delset(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
@@ -2896,15 +2896,13 @@ static int nf_tables_delset(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_TABLE] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_setattr(&ctx, skb, nlh, nla);
+ err = nft_ctx_init_from_setattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_NAME]);
if (IS_ERR(set))
return PTR_ERR(set);
- if (set->flags & NFT_SET_INACTIVE)
- return -ENOENT;
if (!list_empty(&set->bindings))
return -EBUSY;
@@ -3024,16 +3022,14 @@ static const struct nla_policy nft_set_elem_list_policy[NFTA_SET_ELEM_LIST_MAX +
[NFTA_SET_ELEM_LIST_SET_ID] = { .type = NLA_U32 },
};
-static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
+static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx, struct net *net,
const struct sk_buff *skb,
const struct nlmsghdr *nlh,
- const struct nlattr * const nla[],
- bool trans)
+ const struct nlattr * const nla[])
{
const struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nft_af_info *afi;
struct nft_table *table;
- struct net *net = sock_net(skb->sk);
afi = nf_tables_afinfo_lookup(net, nfmsg->nfgen_family, false);
if (IS_ERR(afi))
@@ -3042,10 +3038,8 @@ static int nft_ctx_init_from_elemattr(struct nft_ctx *ctx,
table = nf_tables_table_lookup(afi, nla[NFTA_SET_ELEM_LIST_TABLE]);
if (IS_ERR(table))
return PTR_ERR(table);
- if (!trans && (table->flags & NFT_TABLE_INACTIVE))
- return -ENOENT;
- nft_ctx_init(ctx, skb, nlh, afi, table, NULL, nla);
+ nft_ctx_init(ctx, net, skb, nlh, afi, table, NULL, nla);
return 0;
}
@@ -3135,6 +3129,7 @@ static int nf_tables_dump_setelem(const struct nft_ctx *ctx,
static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
const struct nft_set *set;
struct nft_set_dump_args args;
struct nft_ctx ctx;
@@ -3150,10 +3145,12 @@ static int nf_tables_dump_set(struct sk_buff *skb, struct netlink_callback *cb)
if (err < 0)
return err;
- err = nft_ctx_init_from_elemattr(&ctx, cb->skb, cb->nlh, (void *)nla,
- false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, cb->skb, cb->nlh,
+ (void *)nla);
if (err < 0)
return err;
+ if (ctx.table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
if (IS_ERR(set))
@@ -3208,17 +3205,19 @@ nla_put_failure:
return -ENOSPC;
}
-static int nf_tables_getsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_getsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nft_set *set;
struct nft_ctx ctx;
int err;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
+ if (ctx.table->flags & NFT_TABLE_INACTIVE)
+ return -ENOENT;
set = nf_tables_set_lookup(ctx.table, nla[NFTA_SET_ELEM_LIST_SET]);
if (IS_ERR(set))
@@ -3528,11 +3527,10 @@ err1:
return err;
}
-static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_newsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
const struct nlattr *attr;
struct nft_set *set;
struct nft_ctx ctx;
@@ -3541,7 +3539,7 @@ static int nf_tables_newsetelem(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, true);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
@@ -3623,8 +3621,8 @@ err1:
return err;
}
-static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_delsetelem(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
const struct nlattr *attr;
@@ -3635,7 +3633,7 @@ static int nf_tables_delsetelem(struct sock *nlsk, struct sk_buff *skb,
if (nla[NFTA_SET_ELEM_LIST_ELEMENTS] == NULL)
return -EINVAL;
- err = nft_ctx_init_from_elemattr(&ctx, skb, nlh, nla, false);
+ err = nft_ctx_init_from_elemattr(&ctx, net, skb, nlh, nla);
if (err < 0)
return err;
@@ -3739,11 +3737,10 @@ err:
return err;
}
-static int nf_tables_getgen(struct sock *nlsk, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int nf_tables_getgen(struct net *net, struct sock *nlsk,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const nla[])
{
- struct net *net = sock_net(skb->sk);
struct sk_buff *skb2;
int err;
@@ -3887,9 +3884,8 @@ static void nf_tables_commit_release(struct nft_trans *trans)
kfree(trans);
}
-static int nf_tables_commit(struct sk_buff *skb)
+static int nf_tables_commit(struct net *net, struct sk_buff *skb)
{
- struct net *net = sock_net(skb->sk);
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
@@ -4024,13 +4020,13 @@ static void nf_tables_abort_release(struct nft_trans *trans)
kfree(trans);
}
-static int nf_tables_abort(struct sk_buff *skb)
+static int nf_tables_abort(struct net *net, struct sk_buff *skb)
{
- struct net *net = sock_net(skb->sk);
struct nft_trans *trans, *next;
struct nft_trans_elem *te;
- list_for_each_entry_safe(trans, next, &net->nft.commit_list, list) {
+ list_for_each_entry_safe_reverse(trans, next, &net->nft.commit_list,
+ list) {
switch (trans->msg_type) {
case NFT_MSG_NEWTABLE:
if (nft_trans_table_update(trans)) {
@@ -4446,22 +4442,22 @@ static void nft_verdict_uninit(const struct nft_data *data)
}
}
-static int nft_verdict_dump(struct sk_buff *skb, const struct nft_data *data)
+int nft_verdict_dump(struct sk_buff *skb, int type, const struct nft_verdict *v)
{
struct nlattr *nest;
- nest = nla_nest_start(skb, NFTA_DATA_VERDICT);
+ nest = nla_nest_start(skb, type);
if (!nest)
goto nla_put_failure;
- if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(data->verdict.code)))
+ if (nla_put_be32(skb, NFTA_VERDICT_CODE, htonl(v->code)))
goto nla_put_failure;
- switch (data->verdict.code) {
+ switch (v->code) {
case NFT_JUMP:
case NFT_GOTO:
if (nla_put_string(skb, NFTA_VERDICT_CHAIN,
- data->verdict.chain->name))
+ v->chain->name))
goto nla_put_failure;
}
nla_nest_end(skb, nest);
@@ -4572,7 +4568,7 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
err = nft_value_dump(skb, data, len);
break;
case NFT_DATA_VERDICT:
- err = nft_verdict_dump(skb, data);
+ err = nft_verdict_dump(skb, NFTA_DATA_VERDICT, &data->verdict);
break;
default:
err = -EINVAL;
@@ -4584,7 +4580,7 @@ int nft_data_dump(struct sk_buff *skb, int attr, const struct nft_data *data,
}
EXPORT_SYMBOL_GPL(nft_data_dump);
-static int nf_tables_init_net(struct net *net)
+static int __net_init nf_tables_init_net(struct net *net)
{
INIT_LIST_HEAD(&net->nft.af_info);
INIT_LIST_HEAD(&net->nft.commit_list);
@@ -4592,6 +4588,67 @@ static int nf_tables_init_net(struct net *net)
return 0;
}
+int __nft_release_basechain(struct nft_ctx *ctx)
+{
+ struct nft_rule *rule, *nr;
+
+ BUG_ON(!(ctx->chain->flags & NFT_BASE_CHAIN));
+
+ nf_tables_unregister_hooks(ctx->chain->table, ctx->chain,
+ ctx->afi->nops);
+ list_for_each_entry_safe(rule, nr, &ctx->chain->rules, list) {
+ list_del(&rule->list);
+ ctx->chain->use--;
+ nf_tables_rule_destroy(ctx, rule);
+ }
+ list_del(&ctx->chain->list);
+ ctx->table->use--;
+ nf_tables_chain_destroy(ctx->chain);
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(__nft_release_basechain);
+
+/* Called by nft_unregister_afinfo() from __net_exit path, nfnl_lock is held. */
+static void __nft_release_afinfo(struct net *net, struct nft_af_info *afi)
+{
+ struct nft_table *table, *nt;
+ struct nft_chain *chain, *nc;
+ struct nft_rule *rule, *nr;
+ struct nft_set *set, *ns;
+ struct nft_ctx ctx = {
+ .net = net,
+ .afi = afi,
+ };
+
+ list_for_each_entry_safe(table, nt, &afi->tables, list) {
+ list_for_each_entry(chain, &table->chains, list)
+ nf_tables_unregister_hooks(table, chain, afi->nops);
+ /* No packets are walking on these chains anymore. */
+ ctx.table = table;
+ list_for_each_entry(chain, &table->chains, list) {
+ ctx.chain = chain;
+ list_for_each_entry_safe(rule, nr, &chain->rules, list) {
+ list_del(&rule->list);
+ chain->use--;
+ nf_tables_rule_destroy(&ctx, rule);
+ }
+ }
+ list_for_each_entry_safe(set, ns, &table->sets, list) {
+ list_del(&set->list);
+ table->use--;
+ nft_set_destroy(set);
+ }
+ list_for_each_entry_safe(chain, nc, &table->chains, list) {
+ list_del(&chain->list);
+ table->use--;
+ nf_tables_chain_destroy(chain);
+ }
+ list_del(&table->list);
+ nf_tables_table_destroy(&ctx);
+ }
+}
+
static struct pernet_operations nf_tables_net_ops = {
.init = nf_tables_init_net,
};
diff --git a/net/netfilter/nf_tables_core.c b/net/netfilter/nf_tables_core.c
index f3695a497408..e9f8dffcc244 100644
--- a/net/netfilter/nf_tables_core.c
+++ b/net/netfilter/nf_tables_core.c
@@ -16,22 +16,17 @@
#include <linux/skbuff.h>
#include <linux/netlink.h>
#include <linux/netfilter.h>
+#include <linux/static_key.h>
#include <linux/netfilter/nfnetlink.h>
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_log.h>
-enum nft_trace {
- NFT_TRACE_RULE,
- NFT_TRACE_RETURN,
- NFT_TRACE_POLICY,
-};
-
-static const char *const comments[] = {
- [NFT_TRACE_RULE] = "rule",
- [NFT_TRACE_RETURN] = "return",
- [NFT_TRACE_POLICY] = "policy",
+static const char *const comments[__NFT_TRACETYPE_MAX] = {
+ [NFT_TRACETYPE_POLICY] = "policy",
+ [NFT_TRACETYPE_RETURN] = "return",
+ [NFT_TRACETYPE_RULE] = "rule",
};
static struct nf_loginfo trace_loginfo = {
@@ -44,22 +39,36 @@ static struct nf_loginfo trace_loginfo = {
},
};
-static void __nft_trace_packet(const struct nft_pktinfo *pkt,
- const struct nft_chain *chain,
- int rulenum, enum nft_trace type)
+static noinline void __nft_trace_packet(struct nft_traceinfo *info,
+ const struct nft_chain *chain,
+ int rulenum, enum nft_trace_types type)
{
+ const struct nft_pktinfo *pkt = info->pkt;
+
+ if (!info->trace || !pkt->skb->nf_trace)
+ return;
+
+ info->chain = chain;
+ info->type = type;
+
+ nft_trace_notify(info);
+
nf_log_trace(pkt->net, pkt->pf, pkt->hook, pkt->skb, pkt->in,
pkt->out, &trace_loginfo, "TRACE: %s:%s:%s:%u ",
chain->table->name, chain->name, comments[type],
rulenum);
}
-static inline void nft_trace_packet(const struct nft_pktinfo *pkt,
+static inline void nft_trace_packet(struct nft_traceinfo *info,
const struct nft_chain *chain,
- int rulenum, enum nft_trace type)
+ const struct nft_rule *rule,
+ int rulenum,
+ enum nft_trace_types type)
{
- if (unlikely(pkt->skb->nf_trace))
- __nft_trace_packet(pkt, chain, rulenum, type);
+ if (static_branch_unlikely(&nft_trace_enabled)) {
+ info->rule = rule;
+ __nft_trace_packet(info, chain, rulenum, type);
+ }
}
static void nft_cmp_fast_eval(const struct nft_expr *expr,
@@ -121,7 +130,11 @@ nft_do_chain(struct nft_pktinfo *pkt, void *priv)
struct nft_stats *stats;
int rulenum;
unsigned int gencursor = nft_genmask_cur(net);
+ struct nft_traceinfo info;
+ info.trace = false;
+ if (static_branch_unlikely(&nft_trace_enabled))
+ nft_trace_init(&info, pkt, &regs.verdict, basechain);
do_chain:
rulenum = 0;
rule = list_entry(&chain->rules, struct nft_rule, list);
@@ -151,7 +164,8 @@ next_rule:
regs.verdict.code = NFT_CONTINUE;
continue;
case NFT_CONTINUE:
- nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+ nft_trace_packet(&info, chain, rule,
+ rulenum, NFT_TRACETYPE_RULE);
continue;
}
break;
@@ -161,7 +175,8 @@ next_rule:
case NF_ACCEPT:
case NF_DROP:
case NF_QUEUE:
- nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+ nft_trace_packet(&info, chain, rule,
+ rulenum, NFT_TRACETYPE_RULE);
return regs.verdict.code;
}
@@ -174,7 +189,8 @@ next_rule:
stackptr++;
/* fall through */
case NFT_GOTO:
- nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RULE);
+ nft_trace_packet(&info, chain, rule,
+ rulenum, NFT_TRACETYPE_RULE);
chain = regs.verdict.chain;
goto do_chain;
@@ -182,7 +198,8 @@ next_rule:
rulenum++;
/* fall through */
case NFT_RETURN:
- nft_trace_packet(pkt, chain, rulenum, NFT_TRACE_RETURN);
+ nft_trace_packet(&info, chain, rule,
+ rulenum, NFT_TRACETYPE_RETURN);
break;
default:
WARN_ON(1);
@@ -196,7 +213,8 @@ next_rule:
goto next_rule;
}
- nft_trace_packet(pkt, basechain, -1, NFT_TRACE_POLICY);
+ nft_trace_packet(&info, basechain, NULL, -1,
+ NFT_TRACETYPE_POLICY);
rcu_read_lock_bh();
stats = this_cpu_ptr(rcu_dereference(nft_base_chain(basechain)->stats));
diff --git a/net/netfilter/nf_tables_inet.c b/net/netfilter/nf_tables_inet.c
index 9dd2d216cfc1..6b5f76295d3d 100644
--- a/net/netfilter/nf_tables_inet.c
+++ b/net/netfilter/nf_tables_inet.c
@@ -57,7 +57,7 @@ err:
static void __net_exit nf_tables_inet_exit_net(struct net *net)
{
- nft_unregister_afinfo(net->nft.inet);
+ nft_unregister_afinfo(net, net->nft.inet);
kfree(net->nft.inet);
}
diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index 7b9c053ba750..5eefe4a355c6 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -94,7 +94,7 @@ nft_do_chain_netdev(void *priv, struct sk_buff *skb,
{
struct nft_pktinfo pkt;
- switch (eth_hdr(skb)->h_proto) {
+ switch (skb->protocol) {
case htons(ETH_P_IP):
nft_netdev_set_pktinfo_ipv4(&pkt, skb, state);
break;
@@ -139,7 +139,7 @@ err:
static void nf_tables_netdev_exit_net(struct net *net)
{
- nft_unregister_afinfo(net->nft.netdev);
+ nft_unregister_afinfo(net, net->nft.netdev);
kfree(net->nft.netdev);
}
@@ -156,35 +156,17 @@ static const struct nf_chain_type nft_filter_chain_netdev = {
.hook_mask = (1 << NF_NETDEV_INGRESS),
};
-static void nft_netdev_event(unsigned long event, struct nft_af_info *afi,
- struct net_device *dev, struct nft_table *table,
- struct nft_base_chain *basechain)
+static void nft_netdev_event(unsigned long event, struct net_device *dev,
+ struct nft_ctx *ctx)
{
- switch (event) {
- case NETDEV_REGISTER:
- if (strcmp(basechain->dev_name, dev->name) != 0)
- return;
-
- BUG_ON(!(basechain->flags & NFT_BASECHAIN_DISABLED));
+ struct nft_base_chain *basechain = nft_base_chain(ctx->chain);
- dev_hold(dev);
- basechain->ops[0].dev = dev;
- basechain->flags &= ~NFT_BASECHAIN_DISABLED;
- if (!(table->flags & NFT_TABLE_F_DORMANT))
- nft_register_basechain(basechain, afi->nops);
- break;
+ switch (event) {
case NETDEV_UNREGISTER:
if (strcmp(basechain->dev_name, dev->name) != 0)
return;
- BUG_ON(basechain->flags & NFT_BASECHAIN_DISABLED);
-
- if (!(table->flags & NFT_TABLE_F_DORMANT))
- nft_unregister_basechain(basechain, afi->nops);
-
- dev_put(basechain->ops[0].dev);
- basechain->ops[0].dev = NULL;
- basechain->flags |= NFT_BASECHAIN_DISABLED;
+ __nft_release_basechain(ctx);
break;
case NETDEV_CHANGENAME:
if (dev->ifindex != basechain->ops[0].dev->ifindex)
@@ -201,20 +183,29 @@ static int nf_tables_netdev_event(struct notifier_block *this,
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct nft_af_info *afi;
struct nft_table *table;
- struct nft_chain *chain;
+ struct nft_chain *chain, *nr;
+ struct nft_ctx ctx = {
+ .net = dev_net(dev),
+ };
+
+ if (event != NETDEV_UNREGISTER &&
+ event != NETDEV_CHANGENAME)
+ return NOTIFY_DONE;
nfnl_lock(NFNL_SUBSYS_NFTABLES);
list_for_each_entry(afi, &dev_net(dev)->nft.af_info, list) {
+ ctx.afi = afi;
if (afi->family != NFPROTO_NETDEV)
continue;
list_for_each_entry(table, &afi->tables, list) {
- list_for_each_entry(chain, &table->chains, list) {
+ ctx.table = table;
+ list_for_each_entry_safe(chain, nr, &table->chains, list) {
if (!(chain->flags & NFT_BASE_CHAIN))
continue;
- nft_netdev_event(event, afi, dev, table,
- nft_base_chain(chain));
+ ctx.chain = chain;
+ nft_netdev_event(event, dev, &ctx);
}
}
}
@@ -233,12 +224,12 @@ static int __init nf_tables_netdev_init(void)
nft_register_chain_type(&nft_filter_chain_netdev);
ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
- if (ret < 0)
+ if (ret < 0) {
nft_unregister_chain_type(&nft_filter_chain_netdev);
-
+ return ret;
+ }
register_netdevice_notifier(&nf_tables_netdev_notifier);
-
- return ret;
+ return 0;
}
static void __exit nf_tables_netdev_exit(void)
diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
new file mode 100644
index 000000000000..e9e959f65d91
--- /dev/null
+++ b/net/netfilter/nf_tables_trace.c
@@ -0,0 +1,275 @@
+/*
+ * (C) 2015 Red Hat GmbH
+ * Author: Florian Westphal <fw@strlen.de>
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 as
+ * published by the Free Software Foundation.
+ */
+
+#include <linux/module.h>
+#include <linux/static_key.h>
+#include <linux/hash.h>
+#include <linux/jhash.h>
+#include <linux/if_vlan.h>
+#include <linux/init.h>
+#include <linux/skbuff.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nfnetlink.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
+#include <net/netfilter/nf_tables.h>
+
+#define NFT_TRACETYPE_LL_HSIZE 20
+#define NFT_TRACETYPE_NETWORK_HSIZE 40
+#define NFT_TRACETYPE_TRANSPORT_HSIZE 20
+
+DEFINE_STATIC_KEY_FALSE(nft_trace_enabled);
+EXPORT_SYMBOL_GPL(nft_trace_enabled);
+
+static int trace_fill_id(struct sk_buff *nlskb, struct sk_buff *skb)
+{
+ __be32 id;
+
+ /* using skb address as ID results in a limited number of
+ * values (and quick reuse).
+ *
+ * So we attempt to use as many skb members that will not
+ * change while skb is with netfilter.
+ */
+ id = (__be32)jhash_2words(hash32_ptr(skb), skb_get_hash(skb),
+ skb->skb_iif);
+
+ return nla_put_be32(nlskb, NFTA_TRACE_ID, id);
+}
+
+static int trace_fill_header(struct sk_buff *nlskb, u16 type,
+ const struct sk_buff *skb,
+ int off, unsigned int len)
+{
+ struct nlattr *nla;
+
+ if (len == 0)
+ return 0;
+
+ nla = nla_reserve(nlskb, type, len);
+ if (!nla || skb_copy_bits(skb, off, nla_data(nla), len))
+ return -1;
+
+ return 0;
+}
+
+static int nf_trace_fill_ll_header(struct sk_buff *nlskb,
+ const struct sk_buff *skb)
+{
+ struct vlan_ethhdr veth;
+ int off;
+
+ BUILD_BUG_ON(sizeof(veth) > NFT_TRACETYPE_LL_HSIZE);
+
+ off = skb_mac_header(skb) - skb->data;
+ if (off != -ETH_HLEN)
+ return -1;
+
+ if (skb_copy_bits(skb, off, &veth, ETH_HLEN))
+ return -1;
+
+ veth.h_vlan_proto = skb->vlan_proto;
+ veth.h_vlan_TCI = htons(skb_vlan_tag_get(skb));
+ veth.h_vlan_encapsulated_proto = skb->protocol;
+
+ return nla_put(nlskb, NFTA_TRACE_LL_HEADER, sizeof(veth), &veth);
+}
+
+static int nf_trace_fill_dev_info(struct sk_buff *nlskb,
+ const struct net_device *indev,
+ const struct net_device *outdev)
+{
+ if (indev) {
+ if (nla_put_be32(nlskb, NFTA_TRACE_IIF,
+ htonl(indev->ifindex)))
+ return -1;
+
+ if (nla_put_be16(nlskb, NFTA_TRACE_IIFTYPE,
+ htons(indev->type)))
+ return -1;
+ }
+
+ if (outdev) {
+ if (nla_put_be32(nlskb, NFTA_TRACE_OIF,
+ htonl(outdev->ifindex)))
+ return -1;
+
+ if (nla_put_be16(nlskb, NFTA_TRACE_OIFTYPE,
+ htons(outdev->type)))
+ return -1;
+ }
+
+ return 0;
+}
+
+static int nf_trace_fill_pkt_info(struct sk_buff *nlskb,
+ const struct nft_pktinfo *pkt)
+{
+ const struct sk_buff *skb = pkt->skb;
+ unsigned int len = min_t(unsigned int,
+ pkt->xt.thoff - skb_network_offset(skb),
+ NFT_TRACETYPE_NETWORK_HSIZE);
+ int off = skb_network_offset(skb);
+
+ if (trace_fill_header(nlskb, NFTA_TRACE_NETWORK_HEADER, skb, off, len))
+ return -1;
+
+ len = min_t(unsigned int, skb->len - pkt->xt.thoff,
+ NFT_TRACETYPE_TRANSPORT_HSIZE);
+
+ if (trace_fill_header(nlskb, NFTA_TRACE_TRANSPORT_HEADER, skb,
+ pkt->xt.thoff, len))
+ return -1;
+
+ if (!skb_mac_header_was_set(skb))
+ return 0;
+
+ if (skb_vlan_tag_get(skb))
+ return nf_trace_fill_ll_header(nlskb, skb);
+
+ off = skb_mac_header(skb) - skb->data;
+ len = min_t(unsigned int, -off, NFT_TRACETYPE_LL_HSIZE);
+ return trace_fill_header(nlskb, NFTA_TRACE_LL_HEADER,
+ skb, off, len);
+}
+
+static int nf_trace_fill_rule_info(struct sk_buff *nlskb,
+ const struct nft_traceinfo *info)
+{
+ if (!info->rule)
+ return 0;
+
+ /* a continue verdict with ->type == RETURN means that this is
+ * an implicit return (end of chain reached).
+ *
+ * Since no rule matched, the ->rule pointer is invalid.
+ */
+ if (info->type == NFT_TRACETYPE_RETURN &&
+ info->verdict->code == NFT_CONTINUE)
+ return 0;
+
+ return nla_put_be64(nlskb, NFTA_TRACE_RULE_HANDLE,
+ cpu_to_be64(info->rule->handle));
+}
+
+void nft_trace_notify(struct nft_traceinfo *info)
+{
+ const struct nft_pktinfo *pkt = info->pkt;
+ struct nfgenmsg *nfmsg;
+ struct nlmsghdr *nlh;
+ struct sk_buff *skb;
+ unsigned int size;
+ int event = (NFNL_SUBSYS_NFTABLES << 8) | NFT_MSG_TRACE;
+
+ if (!nfnetlink_has_listeners(pkt->net, NFNLGRP_NFTRACE))
+ return;
+
+ size = nlmsg_total_size(sizeof(struct nfgenmsg)) +
+ nla_total_size(NFT_TABLE_MAXNAMELEN) +
+ nla_total_size(NFT_CHAIN_MAXNAMELEN) +
+ nla_total_size(sizeof(__be64)) + /* rule handle */
+ nla_total_size(sizeof(__be32)) + /* trace type */
+ nla_total_size(0) + /* VERDICT, nested */
+ nla_total_size(sizeof(u32)) + /* verdict code */
+ nla_total_size(NFT_CHAIN_MAXNAMELEN) + /* jump target */
+ nla_total_size(sizeof(u32)) + /* id */
+ nla_total_size(NFT_TRACETYPE_LL_HSIZE) +
+ nla_total_size(NFT_TRACETYPE_NETWORK_HSIZE) +
+ nla_total_size(NFT_TRACETYPE_TRANSPORT_HSIZE) +
+ nla_total_size(sizeof(u32)) + /* iif */
+ nla_total_size(sizeof(__be16)) + /* iiftype */
+ nla_total_size(sizeof(u32)) + /* oif */
+ nla_total_size(sizeof(__be16)) + /* oiftype */
+ nla_total_size(sizeof(u32)) + /* mark */
+ nla_total_size(sizeof(u32)) + /* nfproto */
+ nla_total_size(sizeof(u32)); /* policy */
+
+ skb = nlmsg_new(size, GFP_ATOMIC);
+ if (!skb)
+ return;
+
+ nlh = nlmsg_put(skb, 0, 0, event, sizeof(struct nfgenmsg), 0);
+ if (!nlh)
+ goto nla_put_failure;
+
+ nfmsg = nlmsg_data(nlh);
+ nfmsg->nfgen_family = info->basechain->type->family;
+ nfmsg->version = NFNETLINK_V0;
+ nfmsg->res_id = 0;
+
+ if (nla_put_be32(skb, NFTA_TRACE_NFPROTO, htonl(pkt->pf)))
+ goto nla_put_failure;
+
+ if (nla_put_be32(skb, NFTA_TRACE_TYPE, htonl(info->type)))
+ goto nla_put_failure;
+
+ if (trace_fill_id(skb, pkt->skb))
+ goto nla_put_failure;
+
+ if (info->chain) {
+ if (nla_put_string(skb, NFTA_TRACE_CHAIN,
+ info->chain->name))
+ goto nla_put_failure;
+ if (nla_put_string(skb, NFTA_TRACE_TABLE,
+ info->chain->table->name))
+ goto nla_put_failure;
+ }
+
+ if (nf_trace_fill_rule_info(skb, info))
+ goto nla_put_failure;
+
+ switch (info->type) {
+ case NFT_TRACETYPE_UNSPEC:
+ case __NFT_TRACETYPE_MAX:
+ break;
+ case NFT_TRACETYPE_RETURN:
+ case NFT_TRACETYPE_RULE:
+ if (nft_verdict_dump(skb, NFTA_TRACE_VERDICT, info->verdict))
+ goto nla_put_failure;
+ break;
+ case NFT_TRACETYPE_POLICY:
+ if (nla_put_be32(skb, NFTA_TRACE_POLICY,
+ info->basechain->policy))
+ goto nla_put_failure;
+ break;
+ }
+
+ if (pkt->skb->mark &&
+ nla_put_be32(skb, NFTA_TRACE_MARK, htonl(pkt->skb->mark)))
+ goto nla_put_failure;
+
+ if (!info->packet_dumped) {
+ if (nf_trace_fill_dev_info(skb, pkt->in, pkt->out))
+ goto nla_put_failure;
+
+ if (nf_trace_fill_pkt_info(skb, pkt))
+ goto nla_put_failure;
+ info->packet_dumped = true;
+ }
+
+ nlmsg_end(skb, nlh);
+ nfnetlink_send(skb, pkt->net, 0, NFNLGRP_NFTRACE, 0, GFP_ATOMIC);
+ return;
+
+ nla_put_failure:
+ WARN_ON_ONCE(1);
+ kfree_skb(skb);
+}
+
+void nft_trace_init(struct nft_traceinfo *info, const struct nft_pktinfo *pkt,
+ const struct nft_verdict *verdict,
+ const struct nft_chain *chain)
+{
+ info->basechain = nft_base_chain(chain);
+ info->trace = true;
+ info->packet_dumped = false;
+ info->pkt = pkt;
+ info->verdict = verdict;
+}
diff --git a/net/netfilter/nfnetlink.c b/net/netfilter/nfnetlink.c
index f1d9e887f5b1..2278d9ab723b 100644
--- a/net/netfilter/nfnetlink.c
+++ b/net/netfilter/nfnetlink.c
@@ -33,6 +33,10 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
MODULE_ALIAS_NET_PF_PROTO(PF_NETLINK, NETLINK_NETFILTER);
+#define nfnl_dereference_protected(id) \
+ rcu_dereference_protected(table[(id)].subsys, \
+ lockdep_nfnl_is_held((id)))
+
static char __initdata nfversion[] = "0.30";
static struct {
@@ -49,6 +53,7 @@ static const int nfnl_group2type[NFNLGRP_MAX+1] = {
[NFNLGRP_CONNTRACK_EXP_DESTROY] = NFNL_SUBSYS_CTNETLINK_EXP,
[NFNLGRP_NFTABLES] = NFNL_SUBSYS_NFTABLES,
[NFNLGRP_ACCT_QUOTA] = NFNL_SUBSYS_ACCT,
+ [NFNLGRP_NFTRACE] = NFNL_SUBSYS_NFTABLES,
};
void nfnl_lock(__u8 subsys_id)
@@ -122,13 +127,6 @@ int nfnetlink_has_listeners(struct net *net, unsigned int group)
}
EXPORT_SYMBOL_GPL(nfnetlink_has_listeners);
-struct sk_buff *nfnetlink_alloc_skb(struct net *net, unsigned int size,
- u32 dst_portid, gfp_t gfp_mask)
-{
- return netlink_alloc_skb(net->nfnl, size, dst_portid, gfp_mask);
-}
-EXPORT_SYMBOL_GPL(nfnetlink_alloc_skb);
-
int nfnetlink_send(struct sk_buff *skb, struct net *net, u32 portid,
unsigned int group, int echo, gfp_t flags)
{
@@ -201,19 +199,18 @@ replay:
}
if (nc->call_rcu) {
- err = nc->call_rcu(net->nfnl, skb, nlh,
+ err = nc->call_rcu(net, net->nfnl, skb, nlh,
(const struct nlattr **)cda);
rcu_read_unlock();
} else {
rcu_read_unlock();
nfnl_lock(subsys_id);
- if (rcu_dereference_protected(table[subsys_id].subsys,
- lockdep_is_held(&table[subsys_id].mutex)) != ss ||
+ if (nfnl_dereference_protected(subsys_id) != ss ||
nfnetlink_find_client(type, ss) != nc)
err = -EAGAIN;
else if (nc->call)
- err = nc->call(net->nfnl, skb, nlh,
- (const struct nlattr **)cda);
+ err = nc->call(net, net->nfnl, skb, nlh,
+ (const struct nlattr **)cda);
else
err = -EINVAL;
nfnl_unlock(subsys_id);
@@ -295,30 +292,26 @@ replay:
if (!skb)
return netlink_ack(oskb, nlh, -ENOMEM);
- skb->sk = oskb->sk;
-
nfnl_lock(subsys_id);
- ss = rcu_dereference_protected(table[subsys_id].subsys,
- lockdep_is_held(&table[subsys_id].mutex));
+ ss = nfnl_dereference_protected(subsys_id);
if (!ss) {
#ifdef CONFIG_MODULES
nfnl_unlock(subsys_id);
request_module("nfnetlink-subsys-%d", subsys_id);
nfnl_lock(subsys_id);
- ss = rcu_dereference_protected(table[subsys_id].subsys,
- lockdep_is_held(&table[subsys_id].mutex));
+ ss = nfnl_dereference_protected(subsys_id);
if (!ss)
#endif
{
nfnl_unlock(subsys_id);
- netlink_ack(skb, nlh, -EOPNOTSUPP);
+ netlink_ack(oskb, nlh, -EOPNOTSUPP);
return kfree_skb(skb);
}
}
if (!ss->commit || !ss->abort) {
nfnl_unlock(subsys_id);
- netlink_ack(skb, nlh, -EOPNOTSUPP);
+ netlink_ack(oskb, nlh, -EOPNOTSUPP);
return kfree_skb(skb);
}
@@ -328,10 +321,12 @@ replay:
nlh = nlmsg_hdr(skb);
err = 0;
- if (nlmsg_len(nlh) < sizeof(struct nfgenmsg) ||
- skb->len < nlh->nlmsg_len) {
- err = -EINVAL;
- goto ack;
+ if (nlh->nlmsg_len < NLMSG_HDRLEN ||
+ skb->len < nlh->nlmsg_len ||
+ nlmsg_len(nlh) < sizeof(struct nfgenmsg)) {
+ nfnl_err_reset(&err_list);
+ status |= NFNL_BATCH_FAILURE;
+ goto done;
}
/* Only requests are handled by the kernel */
@@ -381,7 +376,7 @@ replay:
goto ack;
if (nc->call_batch) {
- err = nc->call_batch(net->nfnl, skb, nlh,
+ err = nc->call_batch(net, net->nfnl, skb, nlh,
(const struct nlattr **)cda);
}
@@ -406,7 +401,7 @@ ack:
* pointing to the batch header.
*/
nfnl_err_reset(&err_list);
- netlink_ack(skb, nlmsg_hdr(oskb), -ENOMEM);
+ netlink_ack(oskb, nlmsg_hdr(oskb), -ENOMEM);
status |= NFNL_BATCH_FAILURE;
goto done;
}
@@ -425,15 +420,15 @@ next:
}
done:
if (status & NFNL_BATCH_REPLAY) {
- ss->abort(oskb);
+ ss->abort(net, oskb);
nfnl_err_reset(&err_list);
nfnl_unlock(subsys_id);
kfree_skb(skb);
goto replay;
} else if (status == NFNL_BATCH_DONE) {
- ss->commit(oskb);
+ ss->commit(net, oskb);
} else {
- ss->abort(oskb);
+ ss->abort(net, oskb);
}
nfnl_err_deliver(&err_list, oskb);
@@ -492,7 +487,7 @@ static int nfnetlink_bind(struct net *net, int group)
type = nfnl_group2type[group];
rcu_read_lock();
- ss = nfnetlink_get_subsys(type);
+ ss = nfnetlink_get_subsys(type << 8);
rcu_read_unlock();
if (!ss)
request_module("nfnetlink-subsys-%d", type);
diff --git a/net/netfilter/nfnetlink_acct.c b/net/netfilter/nfnetlink_acct.c
index fefbf5f0b28d..4c2b4c0c4d5f 100644
--- a/net/netfilter/nfnetlink_acct.c
+++ b/net/netfilter/nfnetlink_acct.c
@@ -46,12 +46,11 @@ struct nfacct_filter {
#define NFACCT_F_QUOTA (NFACCT_F_QUOTA_PKTS | NFACCT_F_QUOTA_BYTES)
#define NFACCT_OVERQUOTA_BIT 2 /* NFACCT_F_OVERQUOTA */
-static int
-nfnl_acct_new(struct sock *nfnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+static int nfnl_acct_new(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[])
{
struct nf_acct *nfacct, *matching = NULL;
- struct net *net = sock_net(nfnl);
char *acct_name;
unsigned int size = 0;
u32 flags = 0;
@@ -243,6 +242,9 @@ nfacct_filter_alloc(const struct nlattr * const attr)
if (err < 0)
return ERR_PTR(err);
+ if (!tb[NFACCT_FILTER_MASK] || !tb[NFACCT_FILTER_VALUE])
+ return ERR_PTR(-EINVAL);
+
filter = kzalloc(sizeof(struct nfacct_filter), GFP_KERNEL);
if (!filter)
return ERR_PTR(-ENOMEM);
@@ -253,11 +255,10 @@ nfacct_filter_alloc(const struct nlattr * const attr)
return filter;
}
-static int
-nfnl_acct_get(struct sock *nfnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+static int nfnl_acct_get(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[])
{
- struct net *net = sock_net(nfnl);
int ret = -ENOENT;
struct nf_acct *cur;
char *acct_name;
@@ -333,11 +334,10 @@ static int nfnl_acct_try_del(struct nf_acct *cur)
return ret;
}
-static int
-nfnl_acct_del(struct sock *nfnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+static int nfnl_acct_del(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[])
{
- struct net *net = sock_net(nfnl);
char *acct_name;
struct nf_acct *cur;
int ret = -ENOENT;
diff --git a/net/netfilter/nfnetlink_cthelper.c b/net/netfilter/nfnetlink_cthelper.c
index 54330fb5efaf..e924e95fcc7f 100644
--- a/net/netfilter/nfnetlink_cthelper.c
+++ b/net/netfilter/nfnetlink_cthelper.c
@@ -286,9 +286,9 @@ nfnl_cthelper_update(const struct nlattr * const tb[],
return 0;
}
-static int
-nfnl_cthelper_new(struct sock *nfnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+static int nfnl_cthelper_new(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[])
{
const char *helper_name;
struct nf_conntrack_helper *cur, *helper = NULL;
@@ -498,9 +498,9 @@ out:
return skb->len;
}
-static int
-nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+static int nfnl_cthelper_get(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[])
{
int ret = -ENOENT, i;
struct nf_conntrack_helper *cur;
@@ -570,9 +570,9 @@ nfnl_cthelper_get(struct sock *nfnl, struct sk_buff *skb,
return ret;
}
-static int
-nfnl_cthelper_del(struct sock *nfnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+static int nfnl_cthelper_del(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[])
{
char *helper_name = NULL;
struct nf_conntrack_helper *cur;
diff --git a/net/netfilter/nfnetlink_cttimeout.c b/net/netfilter/nfnetlink_cttimeout.c
index c7a2d0e1c462..2671b9deb103 100644
--- a/net/netfilter/nfnetlink_cttimeout.c
+++ b/net/netfilter/nfnetlink_cttimeout.c
@@ -38,8 +38,6 @@ MODULE_LICENSE("GPL");
MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
MODULE_DESCRIPTION("cttimeout: Extended Netfilter Connection Tracking timeout tuning");
-static LIST_HEAD(cttimeout_list);
-
static const struct nla_policy cttimeout_nla_policy[CTA_TIMEOUT_MAX+1] = {
[CTA_TIMEOUT_NAME] = { .type = NLA_NUL_STRING,
.len = CTNL_TIMEOUT_NAME_MAX - 1},
@@ -67,16 +65,15 @@ ctnl_timeout_parse_policy(void *timeouts, struct nf_conntrack_l4proto *l4proto,
return ret;
}
-static int
-cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int cttimeout_new_timeout(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
__u16 l3num;
__u8 l4num;
struct nf_conntrack_l4proto *l4proto;
struct ctnl_timeout *timeout, *matching = NULL;
- struct net *net = sock_net(skb->sk);
char *name;
int ret;
@@ -90,7 +87,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
l3num = ntohs(nla_get_be16(cda[CTA_TIMEOUT_L3PROTO]));
l4num = nla_get_u8(cda[CTA_TIMEOUT_L4PROTO]);
- list_for_each_entry(timeout, &cttimeout_list, head) {
+ list_for_each_entry(timeout, &net->nfct_timeout_list, head) {
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
@@ -145,7 +142,7 @@ cttimeout_new_timeout(struct sock *ctnl, struct sk_buff *skb,
timeout->l3num = l3num;
timeout->l4proto = l4proto;
atomic_set(&timeout->refcnt, 1);
- list_add_tail_rcu(&timeout->head, &cttimeout_list);
+ list_add_tail_rcu(&timeout->head, &net->nfct_timeout_list);
return 0;
err:
@@ -209,6 +206,7 @@ nla_put_failure:
static int
ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
struct ctnl_timeout *cur, *last;
if (cb->args[2])
@@ -219,7 +217,7 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
cb->args[1] = 0;
rcu_read_lock();
- list_for_each_entry_rcu(cur, &cttimeout_list, head) {
+ list_for_each_entry_rcu(cur, &net->nfct_timeout_list, head) {
if (last) {
if (cur != last)
continue;
@@ -240,10 +238,10 @@ ctnl_timeout_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len;
}
-static int
-cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int cttimeout_get_timeout(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
int ret = -ENOENT;
char *name;
@@ -260,7 +258,7 @@ cttimeout_get_timeout(struct sock *ctnl, struct sk_buff *skb,
return -EINVAL;
name = nla_data(cda[CTA_TIMEOUT_NAME]);
- list_for_each_entry(cur, &cttimeout_list, head) {
+ list_for_each_entry(cur, &net->nfct_timeout_list, head) {
struct sk_buff *skb2;
if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
@@ -301,17 +299,17 @@ static void untimeout(struct nf_conntrack_tuple_hash *i,
RCU_INIT_POINTER(timeout_ext->timeout, NULL);
}
-static void ctnl_untimeout(struct ctnl_timeout *timeout)
+static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
{
struct nf_conntrack_tuple_hash *h;
const struct hlist_nulls_node *nn;
int i;
local_bh_disable();
- for (i = 0; i < init_net.ct.htable_size; i++) {
- spin_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
- if (i < init_net.ct.htable_size) {
- hlist_nulls_for_each_entry(h, nn, &init_net.ct.hash[i], hnnode)
+ for (i = 0; i < net->ct.htable_size; i++) {
+ nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
+ if (i < net->ct.htable_size) {
+ hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
untimeout(h, timeout);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
@@ -320,7 +318,7 @@ static void ctnl_untimeout(struct ctnl_timeout *timeout)
}
/* try to delete object, fail if it is still in use. */
-static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
+static int ctnl_timeout_try_del(struct net *net, struct ctnl_timeout *timeout)
{
int ret = 0;
@@ -329,7 +327,7 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
/* We are protected by nfnl mutex. */
list_del_rcu(&timeout->head);
nf_ct_l4proto_put(timeout->l4proto);
- ctnl_untimeout(timeout);
+ ctnl_untimeout(net, timeout);
kfree_rcu(timeout, rcu_head);
} else {
/* still in use, restore reference counter. */
@@ -339,28 +337,28 @@ static int ctnl_timeout_try_del(struct ctnl_timeout *timeout)
return ret;
}
-static int
-cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int cttimeout_del_timeout(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
- char *name;
struct ctnl_timeout *cur;
int ret = -ENOENT;
+ char *name;
if (!cda[CTA_TIMEOUT_NAME]) {
- list_for_each_entry(cur, &cttimeout_list, head)
- ctnl_timeout_try_del(cur);
+ list_for_each_entry(cur, &net->nfct_timeout_list, head)
+ ctnl_timeout_try_del(net, cur);
return 0;
}
name = nla_data(cda[CTA_TIMEOUT_NAME]);
- list_for_each_entry(cur, &cttimeout_list, head) {
+ list_for_each_entry(cur, &net->nfct_timeout_list, head) {
if (strncmp(cur->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
- ret = ctnl_timeout_try_del(cur);
+ ret = ctnl_timeout_try_del(net, cur);
if (ret < 0)
return ret;
@@ -369,15 +367,14 @@ cttimeout_del_timeout(struct sock *ctnl, struct sk_buff *skb,
return ret;
}
-static int
-cttimeout_default_set(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const cda[])
+static int cttimeout_default_set(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const cda[])
{
__u16 l3num;
__u8 l4num;
struct nf_conntrack_l4proto *l4proto;
- struct net *net = sock_net(skb->sk);
unsigned int *timeouts;
int ret;
@@ -459,14 +456,14 @@ nla_put_failure:
return -1;
}
-static int cttimeout_default_get(struct sock *ctnl, struct sk_buff *skb,
+static int cttimeout_default_get(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const cda[])
{
__u16 l3num;
__u8 l4num;
struct nf_conntrack_l4proto *l4proto;
- struct net *net = sock_net(skb->sk);
struct sk_buff *skb2;
int ret, err;
@@ -511,12 +508,13 @@ err:
}
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
-static struct ctnl_timeout *ctnl_timeout_find_get(const char *name)
+static struct ctnl_timeout *
+ctnl_timeout_find_get(struct net *net, const char *name)
{
struct ctnl_timeout *timeout, *matching = NULL;
rcu_read_lock();
- list_for_each_entry_rcu(timeout, &cttimeout_list, head) {
+ list_for_each_entry_rcu(timeout, &net->nfct_timeout_list, head) {
if (strncmp(timeout->name, name, CTNL_TIMEOUT_NAME_MAX) != 0)
continue;
@@ -569,10 +567,39 @@ static const struct nfnetlink_subsystem cttimeout_subsys = {
MODULE_ALIAS_NFNL_SUBSYS(NFNL_SUBSYS_CTNETLINK_TIMEOUT);
+static int __net_init cttimeout_net_init(struct net *net)
+{
+ INIT_LIST_HEAD(&net->nfct_timeout_list);
+
+ return 0;
+}
+
+static void __net_exit cttimeout_net_exit(struct net *net)
+{
+ struct ctnl_timeout *cur, *tmp;
+
+ ctnl_untimeout(net, NULL);
+
+ list_for_each_entry_safe(cur, tmp, &net->nfct_timeout_list, head) {
+ list_del_rcu(&cur->head);
+ nf_ct_l4proto_put(cur->l4proto);
+ kfree_rcu(cur, rcu_head);
+ }
+}
+
+static struct pernet_operations cttimeout_ops = {
+ .init = cttimeout_net_init,
+ .exit = cttimeout_net_exit,
+};
+
static int __init cttimeout_init(void)
{
int ret;
+ ret = register_pernet_subsys(&cttimeout_ops);
+ if (ret < 0)
+ return ret;
+
ret = nfnetlink_subsys_register(&cttimeout_subsys);
if (ret < 0) {
pr_err("cttimeout_init: cannot register cttimeout with "
@@ -586,28 +613,17 @@ static int __init cttimeout_init(void)
return 0;
err_out:
+ unregister_pernet_subsys(&cttimeout_ops);
return ret;
}
static void __exit cttimeout_exit(void)
{
- struct ctnl_timeout *cur, *tmp;
-
pr_info("cttimeout: unregistering from nfnetlink.\n");
nfnetlink_subsys_unregister(&cttimeout_subsys);
- /* Make sure no conntrack objects refer to custom timeouts anymore. */
- ctnl_untimeout(NULL);
-
- list_for_each_entry_safe(cur, tmp, &cttimeout_list, head) {
- list_del_rcu(&cur->head);
- /* We are sure that our objects have no clients at this point,
- * it's safe to release them all without checking refcnt.
- */
- nf_ct_l4proto_put(cur->l4proto);
- kfree_rcu(cur, rcu_head);
- }
+ unregister_pernet_subsys(&cttimeout_ops);
#ifdef CONFIG_NF_CONNTRACK_TIMEOUT
RCU_INIT_POINTER(nf_ct_timeout_find_get_hook, NULL);
RCU_INIT_POINTER(nf_ct_timeout_put_hook, NULL);
diff --git a/net/netfilter/nfnetlink_log.c b/net/netfilter/nfnetlink_log.c
index 06eb48fceb42..11f81c8385fc 100644
--- a/net/netfilter/nfnetlink_log.c
+++ b/net/netfilter/nfnetlink_log.c
@@ -293,24 +293,20 @@ nfulnl_set_nlbufsiz(struct nfulnl_instance *inst, u_int32_t nlbufsiz)
return status;
}
-static int
+static void
nfulnl_set_timeout(struct nfulnl_instance *inst, u_int32_t timeout)
{
spin_lock_bh(&inst->lock);
inst->flushtimeout = timeout;
spin_unlock_bh(&inst->lock);
-
- return 0;
}
-static int
+static void
nfulnl_set_qthresh(struct nfulnl_instance *inst, u_int32_t qthresh)
{
spin_lock_bh(&inst->lock);
inst->qthreshold = qthresh;
spin_unlock_bh(&inst->lock);
-
- return 0;
}
static int
@@ -334,14 +330,13 @@ nfulnl_alloc_skb(struct net *net, u32 peer_portid, unsigned int inst_size,
* message. WARNING: has to be <= 128k due to slab restrictions */
n = max(inst_size, pkt_size);
- skb = nfnetlink_alloc_skb(net, n, peer_portid, GFP_ATOMIC);
+ skb = alloc_skb(n, GFP_ATOMIC);
if (!skb) {
if (n > pkt_size) {
/* try to allocate only as much as we need for current
* packet */
- skb = nfnetlink_alloc_skb(net, pkt_size,
- peer_portid, GFP_ATOMIC);
+ skb = alloc_skb(pkt_size, GFP_ATOMIC);
}
}
@@ -789,10 +784,9 @@ static struct notifier_block nfulnl_rtnl_notifier = {
.notifier_call = nfulnl_rcv_nl_event,
};
-static int
-nfulnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[])
+static int nfulnl_recv_unsupp(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
{
return -ENOTSUPP;
}
@@ -813,19 +807,17 @@ static const struct nla_policy nfula_cfg_policy[NFULA_CFG_MAX+1] = {
[NFULA_CFG_FLAGS] = { .type = NLA_U16 },
};
-static int
-nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfula[])
+static int nfulnl_recv_config(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nfula[])
{
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int16_t group_num = ntohs(nfmsg->res_id);
struct nfulnl_instance *inst;
struct nfulnl_msg_config_cmd *cmd = NULL;
- struct net *net = sock_net(ctnl);
struct nfnl_log_net *log = nfnl_log_pernet(net);
int ret = 0;
- u16 flags;
+ u16 flags = 0;
if (nfula[NFULA_CFG_CMD]) {
u_int8_t pf = nfmsg->nfgen_family;
@@ -895,7 +887,7 @@ nfulnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto out_put;
default:
ret = -ENOTSUPP;
- break;
+ goto out_put;
}
} else if (!inst) {
ret = -ENODEV;
@@ -1064,15 +1056,26 @@ static int __net_init nfnl_log_net_init(struct net *net)
{
unsigned int i;
struct nfnl_log_net *log = nfnl_log_pernet(net);
+#ifdef CONFIG_PROC_FS
+ struct proc_dir_entry *proc;
+ kuid_t root_uid;
+ kgid_t root_gid;
+#endif
for (i = 0; i < INSTANCE_BUCKETS; i++)
INIT_HLIST_HEAD(&log->instance_table[i]);
spin_lock_init(&log->instances_lock);
#ifdef CONFIG_PROC_FS
- if (!proc_create("nfnetlink_log", 0440,
- net->nf.proc_netfilter, &nful_file_ops))
+ proc = proc_create("nfnetlink_log", 0440,
+ net->nf.proc_netfilter, &nful_file_ops);
+ if (!proc)
return -ENOMEM;
+
+ root_uid = make_kuid(net->user_ns, 0);
+ root_gid = make_kgid(net->user_ns, 0);
+ if (uid_valid(root_uid) && gid_valid(root_gid))
+ proc_set_user(proc, root_uid, root_gid);
#endif
return 0;
}
diff --git a/net/netfilter/nfnetlink_queue.c b/net/netfilter/nfnetlink_queue.c
index 7d81d280cb4f..cb5b630a645b 100644
--- a/net/netfilter/nfnetlink_queue.c
+++ b/net/netfilter/nfnetlink_queue.c
@@ -301,7 +301,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
__be32 **packet_id_ptr)
{
size_t size;
- size_t data_len = 0, cap_len = 0, rem_len = 0;
+ size_t data_len = 0, cap_len = 0;
unsigned int hlen = 0;
struct sk_buff *skb;
struct nlattr *nla;
@@ -361,12 +361,12 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
hlen = min_t(unsigned int, hlen, data_len);
size += sizeof(struct nlattr) + hlen;
cap_len = entskb->len;
- rem_len = data_len - hlen;
break;
}
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+
if (queue->flags & NFQA_CFG_F_CONNTRACK) {
- nfnl_ct = rcu_dereference(nfnl_ct_hook);
if (nfnl_ct != NULL) {
ct = nfnl_ct->get_ct(entskb, &ctinfo);
if (ct != NULL)
@@ -385,8 +385,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
size += nla_total_size(seclen);
}
- skb = __netlink_alloc_skb(net->nfnl, size, rem_len, queue->peer_portid,
- GFP_ATOMIC);
+ skb = alloc_skb(size, GFP_ATOMIC);
if (!skb) {
skb_tx_error(entskb);
return NULL;
@@ -583,7 +582,12 @@ __nfqnl_enqueue_packet(struct net *net, struct nfqnl_instance *queue,
/* nfnetlink_unicast will either free the nskb or add it to a socket */
err = nfnetlink_unicast(nskb, net, queue->peer_portid, MSG_DONTWAIT);
if (err < 0) {
- queue->queue_user_dropped++;
+ if (queue->flags & NFQA_CFG_F_FAIL_OPEN) {
+ failopen = 1;
+ err = 0;
+ } else {
+ queue->queue_user_dropped++;
+ }
goto err_out_unlock;
}
@@ -956,10 +960,10 @@ static int nfq_id_after(unsigned int id, unsigned int max)
return (int)(id - max) > 0;
}
-static int
-nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[])
+static int nfqnl_recv_verdict_batch(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
{
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
struct nf_queue_entry *entry, *tmp;
@@ -968,8 +972,6 @@ nfqnl_recv_verdict_batch(struct sock *ctnl, struct sk_buff *skb,
struct nfqnl_instance *queue;
LIST_HEAD(batch_list);
u16 queue_num = ntohs(nfmsg->res_id);
-
- struct net *net = sock_net(ctnl);
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
queue = verdict_instance_lookup(q, queue_num,
@@ -1028,14 +1030,13 @@ static struct nf_conn *nfqnl_ct_parse(struct nfnl_ct_hook *nfnl_ct,
return ct;
}
-static int
-nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[])
+static int nfqnl_recv_verdict(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
+ const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
{
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int16_t queue_num = ntohs(nfmsg->res_id);
-
struct nfqnl_msg_verdict_hdr *vhdr;
struct nfqnl_instance *queue;
unsigned int verdict;
@@ -1043,8 +1044,6 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
enum ip_conntrack_info uninitialized_var(ctinfo);
struct nfnl_ct_hook *nfnl_ct;
struct nf_conn *ct = NULL;
-
- struct net *net = sock_net(ctnl);
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
queue = instance_lookup(q, queue_num);
@@ -1064,9 +1063,10 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
if (entry == NULL)
return -ENOENT;
+ /* rcu lock already held from nfnl->call_rcu. */
+ nfnl_ct = rcu_dereference(nfnl_ct_hook);
+
if (nfqa[NFQA_CT]) {
- /* rcu lock already held from nfnl->call_rcu. */
- nfnl_ct = rcu_dereference(nfnl_ct_hook);
if (nfnl_ct != NULL)
ct = nfqnl_ct_parse(nfnl_ct, nlh, nfqa, entry, &ctinfo);
}
@@ -1090,10 +1090,9 @@ nfqnl_recv_verdict(struct sock *ctnl, struct sk_buff *skb,
return 0;
}
-static int
-nfqnl_recv_unsupp(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[])
+static int nfqnl_recv_unsupp(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
{
return -ENOTSUPP;
}
@@ -1108,17 +1107,16 @@ static const struct nf_queue_handler nfqh = {
.nf_hook_drop = &nfqnl_nf_hook_drop,
};
-static int
-nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
- const struct nlattr * const nfqa[])
+static int nfqnl_recv_config(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const nfqa[])
{
struct nfgenmsg *nfmsg = nlmsg_data(nlh);
u_int16_t queue_num = ntohs(nfmsg->res_id);
struct nfqnl_instance *queue;
struct nfqnl_msg_config_cmd *cmd = NULL;
- struct net *net = sock_net(ctnl);
struct nfnl_queue_net *q = nfnl_queue_pernet(net);
+ __u32 flags = 0, mask = 0;
int ret = 0;
if (nfqa[NFQA_CFG_CMD]) {
@@ -1131,6 +1129,40 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
}
}
+ /* Check if we support these flags in first place, dependencies should
+ * be there too not to break atomicity.
+ */
+ if (nfqa[NFQA_CFG_FLAGS]) {
+ if (!nfqa[NFQA_CFG_MASK]) {
+ /* A mask is needed to specify which flags are being
+ * changed.
+ */
+ return -EINVAL;
+ }
+
+ flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS]));
+ mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK]));
+
+ if (flags >= NFQA_CFG_F_MAX)
+ return -EOPNOTSUPP;
+
+#if !IS_ENABLED(CONFIG_NETWORK_SECMARK)
+ if (flags & mask & NFQA_CFG_F_SECCTX)
+ return -EOPNOTSUPP;
+#endif
+ if ((flags & mask & NFQA_CFG_F_CONNTRACK) &&
+ !rcu_access_pointer(nfnl_ct_hook)) {
+#ifdef CONFIG_MODULES
+ nfnl_unlock(NFNL_SUBSYS_QUEUE);
+ request_module("ip_conntrack_netlink");
+ nfnl_lock(NFNL_SUBSYS_QUEUE);
+ if (rcu_access_pointer(nfnl_ct_hook))
+ return -EAGAIN;
+#endif
+ return -EOPNOTSUPP;
+ }
+ }
+
rcu_read_lock();
queue = instance_lookup(q, queue_num);
if (queue && queue->peer_portid != NETLINK_CB(skb).portid) {
@@ -1158,70 +1190,38 @@ nfqnl_recv_config(struct sock *ctnl, struct sk_buff *skb,
goto err_out_unlock;
}
instance_destroy(q, queue);
- break;
+ goto err_out_unlock;
case NFQNL_CFG_CMD_PF_BIND:
case NFQNL_CFG_CMD_PF_UNBIND:
break;
default:
ret = -ENOTSUPP;
- break;
+ goto err_out_unlock;
}
}
+ if (!queue) {
+ ret = -ENODEV;
+ goto err_out_unlock;
+ }
+
if (nfqa[NFQA_CFG_PARAMS]) {
- struct nfqnl_msg_config_params *params;
+ struct nfqnl_msg_config_params *params =
+ nla_data(nfqa[NFQA_CFG_PARAMS]);
- if (!queue) {
- ret = -ENODEV;
- goto err_out_unlock;
- }
- params = nla_data(nfqa[NFQA_CFG_PARAMS]);
nfqnl_set_mode(queue, params->copy_mode,
ntohl(params->copy_range));
}
if (nfqa[NFQA_CFG_QUEUE_MAXLEN]) {
- __be32 *queue_maxlen;
+ __be32 *queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);
- if (!queue) {
- ret = -ENODEV;
- goto err_out_unlock;
- }
- queue_maxlen = nla_data(nfqa[NFQA_CFG_QUEUE_MAXLEN]);
spin_lock_bh(&queue->lock);
queue->queue_maxlen = ntohl(*queue_maxlen);
spin_unlock_bh(&queue->lock);
}
if (nfqa[NFQA_CFG_FLAGS]) {
- __u32 flags, mask;
-
- if (!queue) {
- ret = -ENODEV;
- goto err_out_unlock;
- }
-
- if (!nfqa[NFQA_CFG_MASK]) {
- /* A mask is needed to specify which flags are being
- * changed.
- */
- ret = -EINVAL;
- goto err_out_unlock;
- }
-
- flags = ntohl(nla_get_be32(nfqa[NFQA_CFG_FLAGS]));
- mask = ntohl(nla_get_be32(nfqa[NFQA_CFG_MASK]));
-
- if (flags >= NFQA_CFG_F_MAX) {
- ret = -EOPNOTSUPP;
- goto err_out_unlock;
- }
-#if !IS_ENABLED(CONFIG_NETWORK_SECMARK)
- if (flags & mask & NFQA_CFG_F_SECCTX) {
- ret = -EOPNOTSUPP;
- goto err_out_unlock;
- }
-#endif
spin_lock_bh(&queue->lock);
queue->flags &= ~mask;
queue->flags |= flags & mask;
@@ -1417,6 +1417,7 @@ static int __init nfnetlink_queue_init(void)
cleanup_netlink_notifier:
netlink_unregister_notifier(&nfqnl_rtnl_notifier);
+ unregister_pernet_subsys(&nfnl_queue_net_ops);
out:
return status;
}
diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index fde5145f2e36..b78c28ba465f 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -8,6 +8,7 @@
* Development of this code funded by Astaro AG (http://www.astaro.com/)
*/
+#include <asm/unaligned.h>
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/module.h>
@@ -39,6 +40,25 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
d = (void *)dst;
switch (priv->size) {
+ case 8: {
+ u64 src64;
+
+ switch (priv->op) {
+ case NFT_BYTEORDER_NTOH:
+ for (i = 0; i < priv->len / 8; i++) {
+ src64 = get_unaligned((u64 *)&src[i]);
+ put_unaligned_be64(src64, &dst[i]);
+ }
+ break;
+ case NFT_BYTEORDER_HTON:
+ for (i = 0; i < priv->len / 8; i++) {
+ src64 = get_unaligned_be64(&src[i]);
+ put_unaligned(src64, (u64 *)&dst[i]);
+ }
+ break;
+ }
+ break;
+ }
case 4:
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
@@ -101,6 +121,7 @@ static int nft_byteorder_init(const struct nft_ctx *ctx,
switch (priv->size) {
case 2:
case 4:
+ case 8:
break;
default:
return -EINVAL;
diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
index 9c8fab00164b..6228c422c766 100644
--- a/net/netfilter/nft_compat.c
+++ b/net/netfilter/nft_compat.c
@@ -519,9 +519,9 @@ nla_put_failure:
return -1;
}
-static int
-nfnl_compat_get(struct sock *nfnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh, const struct nlattr * const tb[])
+static int nfnl_compat_get(struct net *net, struct sock *nfnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
+ const struct nlattr * const tb[])
{
int ret = 0, target;
struct nfgenmsg *nfmsg;
@@ -660,6 +660,9 @@ nft_match_select_ops(const struct nft_ctx *ctx,
if (IS_ERR(match))
return ERR_PTR(-ENOENT);
+ if (match->matchsize > nla_len(tb[NFTA_MATCH_INFO]))
+ return ERR_PTR(-EINVAL);
+
/* This is the first time we use this match, allocate operations */
nft_match = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
if (nft_match == NULL)
@@ -740,6 +743,9 @@ nft_target_select_ops(const struct nft_ctx *ctx,
if (IS_ERR(target))
return ERR_PTR(-ENOENT);
+ if (target->targetsize > nla_len(tb[NFTA_TARGET_INFO]))
+ return ERR_PTR(-EINVAL);
+
/* This is the first time we use this target, allocate operations */
nft_target = kzalloc(sizeof(struct nft_xt), GFP_KERNEL);
if (nft_target == NULL)
diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
index 1067fb4c1ffa..c9743f78f219 100644
--- a/net/netfilter/nft_counter.c
+++ b/net/netfilter/nft_counter.c
@@ -47,27 +47,34 @@ static void nft_counter_eval(const struct nft_expr *expr,
local_bh_enable();
}
-static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+static void nft_counter_fetch(const struct nft_counter_percpu __percpu *counter,
+ struct nft_counter *total)
{
- struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
- struct nft_counter_percpu *cpu_stats;
- struct nft_counter total;
+ const struct nft_counter_percpu *cpu_stats;
u64 bytes, packets;
unsigned int seq;
int cpu;
- memset(&total, 0, sizeof(total));
+ memset(total, 0, sizeof(*total));
for_each_possible_cpu(cpu) {
- cpu_stats = per_cpu_ptr(priv->counter, cpu);
+ cpu_stats = per_cpu_ptr(counter, cpu);
do {
seq = u64_stats_fetch_begin_irq(&cpu_stats->syncp);
bytes = cpu_stats->counter.bytes;
packets = cpu_stats->counter.packets;
} while (u64_stats_fetch_retry_irq(&cpu_stats->syncp, seq));
- total.packets += packets;
- total.bytes += bytes;
+ total->packets += packets;
+ total->bytes += bytes;
}
+}
+
+static int nft_counter_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(expr);
+ struct nft_counter total;
+
+ nft_counter_fetch(priv->counter, &total);
if (nla_put_be64(skb, NFTA_COUNTER_BYTES, cpu_to_be64(total.bytes)) ||
nla_put_be64(skb, NFTA_COUNTER_PACKETS, cpu_to_be64(total.packets)))
@@ -93,7 +100,7 @@ static int nft_counter_init(const struct nft_ctx *ctx,
cpu_stats = netdev_alloc_pcpu_stats(struct nft_counter_percpu);
if (cpu_stats == NULL)
- return ENOMEM;
+ return -ENOMEM;
preempt_disable();
this_cpu = this_cpu_ptr(cpu_stats);
@@ -118,6 +125,31 @@ static void nft_counter_destroy(const struct nft_ctx *ctx,
free_percpu(priv->counter);
}
+static int nft_counter_clone(struct nft_expr *dst, const struct nft_expr *src)
+{
+ struct nft_counter_percpu_priv *priv = nft_expr_priv(src);
+ struct nft_counter_percpu_priv *priv_clone = nft_expr_priv(dst);
+ struct nft_counter_percpu __percpu *cpu_stats;
+ struct nft_counter_percpu *this_cpu;
+ struct nft_counter total;
+
+ nft_counter_fetch(priv->counter, &total);
+
+ cpu_stats = __netdev_alloc_pcpu_stats(struct nft_counter_percpu,
+ GFP_ATOMIC);
+ if (cpu_stats == NULL)
+ return -ENOMEM;
+
+ preempt_disable();
+ this_cpu = this_cpu_ptr(cpu_stats);
+ this_cpu->counter.packets = total.packets;
+ this_cpu->counter.bytes = total.bytes;
+ preempt_enable();
+
+ priv_clone->counter = cpu_stats;
+ return 0;
+}
+
static struct nft_expr_type nft_counter_type;
static const struct nft_expr_ops nft_counter_ops = {
.type = &nft_counter_type,
@@ -126,6 +158,7 @@ static const struct nft_expr_ops nft_counter_ops = {
.init = nft_counter_init,
.destroy = nft_counter_destroy,
.dump = nft_counter_dump,
+ .clone = nft_counter_clone,
};
static struct nft_expr_type nft_counter_type __read_mostly = {
diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index 8cbca3432f90..d4a4619fcebc 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -16,6 +16,7 @@
#include <linux/netfilter/nf_tables.h>
#include <net/netfilter/nf_tables.h>
#include <net/netfilter/nf_conntrack.h>
+#include <net/netfilter/nf_conntrack_acct.h>
#include <net/netfilter/nf_conntrack_tuple.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_ecache.h>
@@ -30,6 +31,18 @@ struct nft_ct {
};
};
+static u64 nft_ct_get_eval_counter(const struct nf_conn_counter *c,
+ enum nft_ct_keys k,
+ enum ip_conntrack_dir d)
+{
+ if (d < IP_CT_DIR_MAX)
+ return k == NFT_CT_BYTES ? atomic64_read(&c[d].bytes) :
+ atomic64_read(&c[d].packets);
+
+ return nft_ct_get_eval_counter(c, k, IP_CT_DIR_ORIGINAL) +
+ nft_ct_get_eval_counter(c, k, IP_CT_DIR_REPLY);
+}
+
static void nft_ct_get_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -115,6 +128,17 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
return;
}
#endif
+ case NFT_CT_BYTES: /* fallthrough */
+ case NFT_CT_PKTS: {
+ const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
+ u64 count = 0;
+
+ if (acct)
+ count = nft_ct_get_eval_counter(acct->counter,
+ priv->key, priv->dir);
+ memcpy(dest, &count, sizeof(count));
+ return;
+ }
default:
break;
}
@@ -291,6 +315,13 @@ static int nft_ct_get_init(const struct nft_ctx *ctx,
return -EINVAL;
len = FIELD_SIZEOF(struct nf_conntrack_tuple, src.u.all);
break;
+ case NFT_CT_BYTES:
+ case NFT_CT_PKTS:
+ /* no direction? return sum of original + reply */
+ if (tb[NFTA_CT_DIRECTION] == NULL)
+ priv->dir = IP_CT_DIR_MAX;
+ len = sizeof(u64);
+ break;
default:
return -EOPNOTSUPP;
}
@@ -366,6 +397,7 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
goto nla_put_failure;
switch (priv->key) {
+ case NFT_CT_L3PROTOCOL:
case NFT_CT_PROTOCOL:
case NFT_CT_SRC:
case NFT_CT_DST:
@@ -373,6 +405,13 @@ static int nft_ct_get_dump(struct sk_buff *skb, const struct nft_expr *expr)
case NFT_CT_PROTO_DST:
if (nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
goto nla_put_failure;
+ break;
+ case NFT_CT_BYTES:
+ case NFT_CT_PKTS:
+ if (priv->dir < IP_CT_DIR_MAX &&
+ nla_put_u8(skb, NFTA_CT_DIRECTION, priv->dir))
+ goto nla_put_failure;
+ break;
default:
break;
}
diff --git a/net/netfilter/nft_dup_netdev.c b/net/netfilter/nft_dup_netdev.c
new file mode 100644
index 000000000000..2cc1e0ef56e8
--- /dev/null
+++ b/net/netfilter/nft_dup_netdev.c
@@ -0,0 +1,97 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_dup_netdev.h>
+
+struct nft_dup_netdev {
+ enum nft_registers sreg_dev:8;
+};
+
+static void nft_dup_netdev_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_dup_netdev *priv = nft_expr_priv(expr);
+ int oif = regs->data[priv->sreg_dev];
+
+ nf_dup_netdev_egress(pkt, oif);
+}
+
+static const struct nla_policy nft_dup_netdev_policy[NFTA_DUP_MAX + 1] = {
+ [NFTA_DUP_SREG_DEV] = { .type = NLA_U32 },
+};
+
+static int nft_dup_netdev_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_dup_netdev *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_DUP_SREG_DEV] == NULL)
+ return -EINVAL;
+
+ priv->sreg_dev = nft_parse_register(tb[NFTA_DUP_SREG_DEV]);
+ return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+}
+
+static const struct nft_expr_ops nft_dup_netdev_ingress_ops;
+
+static int nft_dup_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_dup_netdev *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_DUP_SREG_DEV, priv->sreg_dev))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_dup_netdev_type;
+static const struct nft_expr_ops nft_dup_netdev_ops = {
+ .type = &nft_dup_netdev_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_dup_netdev)),
+ .eval = nft_dup_netdev_eval,
+ .init = nft_dup_netdev_init,
+ .dump = nft_dup_netdev_dump,
+};
+
+static struct nft_expr_type nft_dup_netdev_type __read_mostly = {
+ .family = NFPROTO_NETDEV,
+ .name = "dup",
+ .ops = &nft_dup_netdev_ops,
+ .policy = nft_dup_netdev_policy,
+ .maxattr = NFTA_DUP_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_dup_netdev_module_init(void)
+{
+ return nft_register_expr(&nft_dup_netdev_type);
+}
+
+static void __exit nft_dup_netdev_module_exit(void)
+{
+ nft_unregister_expr(&nft_dup_netdev_type);
+}
+
+module_init(nft_dup_netdev_module_init);
+module_exit(nft_dup_netdev_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(5, "dup");
diff --git a/net/netfilter/nft_dynset.c b/net/netfilter/nft_dynset.c
index 513a8ef60a59..9dec3bd1b63c 100644
--- a/net/netfilter/nft_dynset.c
+++ b/net/netfilter/nft_dynset.c
@@ -50,8 +50,9 @@ static void *nft_dynset_new(struct nft_set *set, const struct nft_expr *expr,
}
ext = nft_set_elem_ext(set, elem);
- if (priv->expr != NULL)
- nft_expr_clone(nft_set_ext_expr(ext), priv->expr);
+ if (priv->expr != NULL &&
+ nft_expr_clone(nft_set_ext_expr(ext), priv->expr) < 0)
+ return NULL;
return elem;
}
diff --git a/net/netfilter/nft_fwd_netdev.c b/net/netfilter/nft_fwd_netdev.c
new file mode 100644
index 000000000000..763ebc3e0b2b
--- /dev/null
+++ b/net/netfilter/nft_fwd_netdev.c
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2015 Pablo Neira Ayuso <pablo@netfilter.org>
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms of the GNU General Public License version 2 as published by
+ * the Free Software Foundation.
+ */
+
+#include <linux/kernel.h>
+#include <linux/init.h>
+#include <linux/module.h>
+#include <linux/netlink.h>
+#include <linux/netfilter.h>
+#include <linux/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_dup_netdev.h>
+
+struct nft_fwd_netdev {
+ enum nft_registers sreg_dev:8;
+};
+
+static void nft_fwd_netdev_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ struct nft_fwd_netdev *priv = nft_expr_priv(expr);
+ int oif = regs->data[priv->sreg_dev];
+
+ nf_dup_netdev_egress(pkt, oif);
+ regs->verdict.code = NF_DROP;
+}
+
+static const struct nla_policy nft_fwd_netdev_policy[NFTA_FWD_MAX + 1] = {
+ [NFTA_FWD_SREG_DEV] = { .type = NLA_U32 },
+};
+
+static int nft_fwd_netdev_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_fwd_netdev *priv = nft_expr_priv(expr);
+
+ if (tb[NFTA_FWD_SREG_DEV] == NULL)
+ return -EINVAL;
+
+ priv->sreg_dev = nft_parse_register(tb[NFTA_FWD_SREG_DEV]);
+ return nft_validate_register_load(priv->sreg_dev, sizeof(int));
+}
+
+static const struct nft_expr_ops nft_fwd_netdev_ingress_ops;
+
+static int nft_fwd_netdev_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ struct nft_fwd_netdev *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_FWD_SREG_DEV, priv->sreg_dev))
+ goto nla_put_failure;
+
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static struct nft_expr_type nft_fwd_netdev_type;
+static const struct nft_expr_ops nft_fwd_netdev_ops = {
+ .type = &nft_fwd_netdev_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_fwd_netdev)),
+ .eval = nft_fwd_netdev_eval,
+ .init = nft_fwd_netdev_init,
+ .dump = nft_fwd_netdev_dump,
+};
+
+static struct nft_expr_type nft_fwd_netdev_type __read_mostly = {
+ .family = NFPROTO_NETDEV,
+ .name = "fwd",
+ .ops = &nft_fwd_netdev_ops,
+ .policy = nft_fwd_netdev_policy,
+ .maxattr = NFTA_FWD_MAX,
+ .owner = THIS_MODULE,
+};
+
+static int __init nft_fwd_netdev_module_init(void)
+{
+ return nft_register_expr(&nft_fwd_netdev_type);
+}
+
+static void __exit nft_fwd_netdev_module_exit(void)
+{
+ nft_unregister_expr(&nft_fwd_netdev_type);
+}
+
+module_init(nft_fwd_netdev_module_init);
+module_exit(nft_fwd_netdev_module_exit);
+
+MODULE_LICENSE("GPL");
+MODULE_AUTHOR("Pablo Neira Ayuso <pablo@netfilter.org>");
+MODULE_ALIAS_NFT_AF_EXPR(5, "fwd");
diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
index 5d67938f8b2f..99d18578afc6 100644
--- a/net/netfilter/nft_limit.c
+++ b/net/netfilter/nft_limit.c
@@ -26,6 +26,7 @@ struct nft_limit {
u64 rate;
u64 nsecs;
u32 burst;
+ bool invert;
};
static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost)
@@ -44,11 +45,11 @@ static inline bool nft_limit_eval(struct nft_limit *limit, u64 cost)
if (delta >= 0) {
limit->tokens = delta;
spin_unlock_bh(&limit_lock);
- return false;
+ return limit->invert;
}
limit->tokens = tokens;
spin_unlock_bh(&limit_lock);
- return true;
+ return !limit->invert;
}
static int nft_limit_init(struct nft_limit *limit,
@@ -78,6 +79,12 @@ static int nft_limit_init(struct nft_limit *limit,
limit->rate = rate;
}
+ if (tb[NFTA_LIMIT_FLAGS]) {
+ u32 flags = ntohl(nla_get_be32(tb[NFTA_LIMIT_FLAGS]));
+
+ if (flags & NFT_LIMIT_F_INV)
+ limit->invert = true;
+ }
limit->last = ktime_get_ns();
return 0;
@@ -86,13 +93,15 @@ static int nft_limit_init(struct nft_limit *limit,
static int nft_limit_dump(struct sk_buff *skb, const struct nft_limit *limit,
enum nft_limit_type type)
{
+ u32 flags = limit->invert ? NFT_LIMIT_F_INV : 0;
u64 secs = div_u64(limit->nsecs, NSEC_PER_SEC);
u64 rate = limit->rate - limit->burst;
if (nla_put_be64(skb, NFTA_LIMIT_RATE, cpu_to_be64(rate)) ||
nla_put_be64(skb, NFTA_LIMIT_UNIT, cpu_to_be64(secs)) ||
nla_put_be32(skb, NFTA_LIMIT_BURST, htonl(limit->burst)) ||
- nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)))
+ nla_put_be32(skb, NFTA_LIMIT_TYPE, htonl(type)) ||
+ nla_put_be32(skb, NFTA_LIMIT_FLAGS, htonl(flags)))
goto nla_put_failure;
return 0;
@@ -120,6 +129,7 @@ static const struct nla_policy nft_limit_policy[NFTA_LIMIT_MAX + 1] = {
[NFTA_LIMIT_UNIT] = { .type = NLA_U64 },
[NFTA_LIMIT_BURST] = { .type = NLA_U32 },
[NFTA_LIMIT_TYPE] = { .type = NLA_U32 },
+ [NFTA_LIMIT_FLAGS] = { .type = NLA_U32 },
};
static int nft_limit_pkts_init(const struct nft_ctx *ctx,
diff --git a/net/netfilter/nft_masq.c b/net/netfilter/nft_masq.c
index 9aea747b43ea..81b5ad6165ac 100644
--- a/net/netfilter/nft_masq.c
+++ b/net/netfilter/nft_masq.c
@@ -17,7 +17,9 @@
#include <net/netfilter/nft_masq.h>
const struct nla_policy nft_masq_policy[NFTA_MASQ_MAX + 1] = {
- [NFTA_MASQ_FLAGS] = { .type = NLA_U32 },
+ [NFTA_MASQ_FLAGS] = { .type = NLA_U32 },
+ [NFTA_MASQ_REG_PROTO_MIN] = { .type = NLA_U32 },
+ [NFTA_MASQ_REG_PROTO_MAX] = { .type = NLA_U32 },
};
EXPORT_SYMBOL_GPL(nft_masq_policy);
@@ -40,6 +42,7 @@ int nft_masq_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
{
+ u32 plen = FIELD_SIZEOF(struct nf_nat_range, min_addr.all);
struct nft_masq *priv = nft_expr_priv(expr);
int err;
@@ -47,12 +50,32 @@ int nft_masq_init(const struct nft_ctx *ctx,
if (err)
return err;
- if (tb[NFTA_MASQ_FLAGS] == NULL)
- return 0;
-
- priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
- if (priv->flags & ~NF_NAT_RANGE_MASK)
- return -EINVAL;
+ if (tb[NFTA_MASQ_FLAGS]) {
+ priv->flags = ntohl(nla_get_be32(tb[NFTA_MASQ_FLAGS]));
+ if (priv->flags & ~NF_NAT_RANGE_MASK)
+ return -EINVAL;
+ }
+
+ if (tb[NFTA_MASQ_REG_PROTO_MIN]) {
+ priv->sreg_proto_min =
+ nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MIN]);
+
+ err = nft_validate_register_load(priv->sreg_proto_min, plen);
+ if (err < 0)
+ return err;
+
+ if (tb[NFTA_MASQ_REG_PROTO_MAX]) {
+ priv->sreg_proto_max =
+ nft_parse_register(tb[NFTA_MASQ_REG_PROTO_MAX]);
+
+ err = nft_validate_register_load(priv->sreg_proto_max,
+ plen);
+ if (err < 0)
+ return err;
+ } else {
+ priv->sreg_proto_max = priv->sreg_proto_min;
+ }
+ }
return 0;
}
@@ -62,12 +85,18 @@ int nft_masq_dump(struct sk_buff *skb, const struct nft_expr *expr)
{
const struct nft_masq *priv = nft_expr_priv(expr);
- if (priv->flags == 0)
- return 0;
-
- if (nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
+ if (priv->flags != 0 &&
+ nla_put_be32(skb, NFTA_MASQ_FLAGS, htonl(priv->flags)))
goto nla_put_failure;
+ if (priv->sreg_proto_min) {
+ if (nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MIN,
+ priv->sreg_proto_min) ||
+ nft_dump_register(skb, NFTA_MASQ_REG_PROTO_MAX,
+ priv->sreg_proto_max))
+ goto nla_put_failure;
+ }
+
return 0;
nla_put_failure:
diff --git a/net/netfilter/nft_meta.c b/net/netfilter/nft_meta.c
index e4ad2c24bc41..16c50b0dd426 100644
--- a/net/netfilter/nft_meta.c
+++ b/net/netfilter/nft_meta.c
@@ -18,12 +18,18 @@
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/smp.h>
+#include <linux/static_key.h>
#include <net/dst.h>
#include <net/sock.h>
#include <net/tcp_states.h> /* for TCP_TIME_WAIT */
#include <net/netfilter/nf_tables.h>
+#include <net/netfilter/nf_tables_core.h>
#include <net/netfilter/nft_meta.h>
+#include <uapi/linux/netfilter_bridge.h> /* NF_BR_PRE_ROUTING */
+
+static DEFINE_PER_CPU(struct rnd_state, nft_prandom_state);
+
void nft_meta_get_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -31,6 +37,7 @@ void nft_meta_get_eval(const struct nft_expr *expr,
const struct nft_meta *priv = nft_expr_priv(expr);
const struct sk_buff *skb = pkt->skb;
const struct net_device *in = pkt->in, *out = pkt->out;
+ struct sock *sk;
u32 *dest = &regs->data[priv->dreg];
switch (priv->key) {
@@ -86,33 +93,35 @@ void nft_meta_get_eval(const struct nft_expr *expr,
*(u16 *)dest = out->type;
break;
case NFT_META_SKUID:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket == NULL ||
- skb->sk->sk_socket->file == NULL) {
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket == NULL ||
+ sk->sk_socket->file == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
goto err;
}
*dest = from_kuid_munged(&init_user_ns,
- skb->sk->sk_socket->file->f_cred->fsuid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ sk->sk_socket->file->f_cred->fsuid);
+ read_unlock_bh(&sk->sk_callback_lock);
break;
case NFT_META_SKGID:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- read_lock_bh(&skb->sk->sk_callback_lock);
- if (skb->sk->sk_socket == NULL ||
- skb->sk->sk_socket->file == NULL) {
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ read_lock_bh(&sk->sk_callback_lock);
+ if (sk->sk_socket == NULL ||
+ sk->sk_socket->file == NULL) {
+ read_unlock_bh(&sk->sk_callback_lock);
goto err;
}
*dest = from_kgid_munged(&init_user_ns,
- skb->sk->sk_socket->file->f_cred->fsgid);
- read_unlock_bh(&skb->sk->sk_callback_lock);
+ sk->sk_socket->file->f_cred->fsgid);
+ read_unlock_bh(&sk->sk_callback_lock);
break;
#ifdef CONFIG_IP_ROUTE_CLASSID
case NFT_META_RTCLASSID: {
@@ -168,11 +177,17 @@ void nft_meta_get_eval(const struct nft_expr *expr,
break;
#ifdef CONFIG_CGROUP_NET_CLASSID
case NFT_META_CGROUP:
- if (skb->sk == NULL || !sk_fullsock(skb->sk))
+ sk = skb_to_full_sk(skb);
+ if (!sk || !sk_fullsock(sk))
goto err;
- *dest = skb->sk->sk_classid;
+ *dest = sock_cgroup_classid(&sk->sk_cgrp_data);
break;
#endif
+ case NFT_META_PRANDOM: {
+ struct rnd_state *state = this_cpu_ptr(&nft_prandom_state);
+ *dest = prandom_u32_state(state);
+ break;
+ }
default:
WARN_ON(1);
goto err;
@@ -184,6 +199,13 @@ err:
}
EXPORT_SYMBOL_GPL(nft_meta_get_eval);
+/* don't change or set _LOOPBACK, _USER, etc. */
+static bool pkt_type_ok(u32 p)
+{
+ return p == PACKET_HOST || p == PACKET_BROADCAST ||
+ p == PACKET_MULTICAST || p == PACKET_OTHERHOST;
+}
+
void nft_meta_set_eval(const struct nft_expr *expr,
struct nft_regs *regs,
const struct nft_pktinfo *pkt)
@@ -199,6 +221,11 @@ void nft_meta_set_eval(const struct nft_expr *expr,
case NFT_META_PRIORITY:
skb->priority = value;
break;
+ case NFT_META_PKTTYPE:
+ if (skb->pkt_type != value &&
+ pkt_type_ok(value) && pkt_type_ok(skb->pkt_type))
+ skb->pkt_type = value;
+ break;
case NFT_META_NFTRACE:
skb->nf_trace = 1;
break;
@@ -257,6 +284,10 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
case NFT_META_OIFNAME:
len = IFNAMSIZ;
break;
+ case NFT_META_PRANDOM:
+ prandom_init_once(&nft_prandom_state);
+ len = sizeof(u32);
+ break;
default:
return -EOPNOTSUPP;
}
@@ -267,6 +298,24 @@ int nft_meta_get_init(const struct nft_ctx *ctx,
}
EXPORT_SYMBOL_GPL(nft_meta_get_init);
+static int nft_meta_set_init_pkttype(const struct nft_ctx *ctx)
+{
+ unsigned int hooks;
+
+ switch (ctx->afi->family) {
+ case NFPROTO_BRIDGE:
+ hooks = 1 << NF_BR_PRE_ROUTING;
+ break;
+ case NFPROTO_NETDEV:
+ hooks = 1 << NF_NETDEV_INGRESS;
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_chain_validate_hooks(ctx->chain, hooks);
+}
+
int nft_meta_set_init(const struct nft_ctx *ctx,
const struct nft_expr *expr,
const struct nlattr * const tb[])
@@ -284,6 +333,12 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
case NFT_META_NFTRACE:
len = sizeof(u8);
break;
+ case NFT_META_PKTTYPE:
+ err = nft_meta_set_init_pkttype(ctx);
+ if (err)
+ return err;
+ len = sizeof(u8);
+ break;
default:
return -EOPNOTSUPP;
}
@@ -293,6 +348,9 @@ int nft_meta_set_init(const struct nft_ctx *ctx,
if (err < 0)
return err;
+ if (priv->key == NFT_META_NFTRACE)
+ static_branch_inc(&nft_trace_enabled);
+
return 0;
}
EXPORT_SYMBOL_GPL(nft_meta_set_init);
@@ -330,6 +388,16 @@ nla_put_failure:
}
EXPORT_SYMBOL_GPL(nft_meta_set_dump);
+void nft_meta_set_destroy(const struct nft_ctx *ctx,
+ const struct nft_expr *expr)
+{
+ const struct nft_meta *priv = nft_expr_priv(expr);
+
+ if (priv->key == NFT_META_NFTRACE)
+ static_branch_dec(&nft_trace_enabled);
+}
+EXPORT_SYMBOL_GPL(nft_meta_set_destroy);
+
static struct nft_expr_type nft_meta_type;
static const struct nft_expr_ops nft_meta_get_ops = {
.type = &nft_meta_type,
@@ -344,6 +412,7 @@ static const struct nft_expr_ops nft_meta_set_ops = {
.size = NFT_EXPR_SIZE(sizeof(struct nft_meta)),
.eval = nft_meta_set_eval,
.init = nft_meta_set_init,
+ .destroy = nft_meta_set_destroy,
.dump = nft_meta_set_dump,
};
diff --git a/net/netfilter/nft_payload.c b/net/netfilter/nft_payload.c
index 09b4b07eb676..12cd4bf16d17 100644
--- a/net/netfilter/nft_payload.c
+++ b/net/netfilter/nft_payload.c
@@ -107,10 +107,13 @@ err:
}
static const struct nla_policy nft_payload_policy[NFTA_PAYLOAD_MAX + 1] = {
- [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 },
- [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_SREG] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_DREG] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_BASE] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_OFFSET] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_LEN] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_CSUM_TYPE] = { .type = NLA_U32 },
+ [NFTA_PAYLOAD_CSUM_OFFSET] = { .type = NLA_U32 },
};
static int nft_payload_init(const struct nft_ctx *ctx,
@@ -160,6 +163,118 @@ const struct nft_expr_ops nft_payload_fast_ops = {
.dump = nft_payload_dump,
};
+static void nft_payload_set_eval(const struct nft_expr *expr,
+ struct nft_regs *regs,
+ const struct nft_pktinfo *pkt)
+{
+ const struct nft_payload_set *priv = nft_expr_priv(expr);
+ struct sk_buff *skb = pkt->skb;
+ const u32 *src = &regs->data[priv->sreg];
+ int offset, csum_offset;
+ __wsum fsum, tsum;
+ __sum16 sum;
+
+ switch (priv->base) {
+ case NFT_PAYLOAD_LL_HEADER:
+ if (!skb_mac_header_was_set(skb))
+ goto err;
+ offset = skb_mac_header(skb) - skb->data;
+ break;
+ case NFT_PAYLOAD_NETWORK_HEADER:
+ offset = skb_network_offset(skb);
+ break;
+ case NFT_PAYLOAD_TRANSPORT_HEADER:
+ offset = pkt->xt.thoff;
+ break;
+ default:
+ BUG();
+ }
+
+ csum_offset = offset + priv->csum_offset;
+ offset += priv->offset;
+
+ if (priv->csum_type == NFT_PAYLOAD_CSUM_INET &&
+ (priv->base != NFT_PAYLOAD_TRANSPORT_HEADER ||
+ skb->ip_summed != CHECKSUM_PARTIAL)) {
+ if (skb_copy_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
+ goto err;
+
+ fsum = skb_checksum(skb, offset, priv->len, 0);
+ tsum = csum_partial(src, priv->len, 0);
+ sum = csum_fold(csum_add(csum_sub(~csum_unfold(sum), fsum),
+ tsum));
+ if (sum == 0)
+ sum = CSUM_MANGLED_0;
+
+ if (!skb_make_writable(skb, csum_offset + sizeof(sum)) ||
+ skb_store_bits(skb, csum_offset, &sum, sizeof(sum)) < 0)
+ goto err;
+ }
+
+ if (!skb_make_writable(skb, max(offset + priv->len, 0)) ||
+ skb_store_bits(skb, offset, src, priv->len) < 0)
+ goto err;
+
+ return;
+err:
+ regs->verdict.code = NFT_BREAK;
+}
+
+static int nft_payload_set_init(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nlattr * const tb[])
+{
+ struct nft_payload_set *priv = nft_expr_priv(expr);
+
+ priv->base = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_BASE]));
+ priv->offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
+ priv->len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
+ priv->sreg = nft_parse_register(tb[NFTA_PAYLOAD_SREG]);
+
+ if (tb[NFTA_PAYLOAD_CSUM_TYPE])
+ priv->csum_type =
+ ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_TYPE]));
+ if (tb[NFTA_PAYLOAD_CSUM_OFFSET])
+ priv->csum_offset =
+ ntohl(nla_get_be32(tb[NFTA_PAYLOAD_CSUM_OFFSET]));
+
+ switch (priv->csum_type) {
+ case NFT_PAYLOAD_CSUM_NONE:
+ case NFT_PAYLOAD_CSUM_INET:
+ break;
+ default:
+ return -EOPNOTSUPP;
+ }
+
+ return nft_validate_register_load(priv->sreg, priv->len);
+}
+
+static int nft_payload_set_dump(struct sk_buff *skb, const struct nft_expr *expr)
+{
+ const struct nft_payload_set *priv = nft_expr_priv(expr);
+
+ if (nft_dump_register(skb, NFTA_PAYLOAD_SREG, priv->sreg) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_BASE, htonl(priv->base)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_OFFSET, htonl(priv->offset)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_LEN, htonl(priv->len)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_CSUM_TYPE, htonl(priv->csum_type)) ||
+ nla_put_be32(skb, NFTA_PAYLOAD_CSUM_OFFSET,
+ htonl(priv->csum_offset)))
+ goto nla_put_failure;
+ return 0;
+
+nla_put_failure:
+ return -1;
+}
+
+static const struct nft_expr_ops nft_payload_set_ops = {
+ .type = &nft_payload_type,
+ .size = NFT_EXPR_SIZE(sizeof(struct nft_payload_set)),
+ .eval = nft_payload_set_eval,
+ .init = nft_payload_set_init,
+ .dump = nft_payload_set_dump,
+};
+
static const struct nft_expr_ops *
nft_payload_select_ops(const struct nft_ctx *ctx,
const struct nlattr * const tb[])
@@ -167,8 +282,7 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
enum nft_payload_bases base;
unsigned int offset, len;
- if (tb[NFTA_PAYLOAD_DREG] == NULL ||
- tb[NFTA_PAYLOAD_BASE] == NULL ||
+ if (tb[NFTA_PAYLOAD_BASE] == NULL ||
tb[NFTA_PAYLOAD_OFFSET] == NULL ||
tb[NFTA_PAYLOAD_LEN] == NULL)
return ERR_PTR(-EINVAL);
@@ -183,6 +297,15 @@ nft_payload_select_ops(const struct nft_ctx *ctx,
return ERR_PTR(-EOPNOTSUPP);
}
+ if (tb[NFTA_PAYLOAD_SREG] != NULL) {
+ if (tb[NFTA_PAYLOAD_DREG] != NULL)
+ return ERR_PTR(-EINVAL);
+ return &nft_payload_set_ops;
+ }
+
+ if (tb[NFTA_PAYLOAD_DREG] == NULL)
+ return ERR_PTR(-EINVAL);
+
offset = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_OFFSET]));
len = ntohl(nla_get_be32(tb[NFTA_PAYLOAD_LEN]));
diff --git a/net/netfilter/x_tables.c b/net/netfilter/x_tables.c
index d4aaad747ea9..582c9cfd6567 100644
--- a/net/netfilter/x_tables.c
+++ b/net/netfilter/x_tables.c
@@ -26,6 +26,7 @@
#include <linux/mm.h>
#include <linux/slab.h>
#include <linux/audit.h>
+#include <linux/user_namespace.h>
#include <net/net_namespace.h>
#include <linux/netfilter/x_tables.h>
@@ -658,6 +659,9 @@ struct xt_table_info *xt_alloc_table_info(unsigned int size)
struct xt_table_info *info = NULL;
size_t sz = sizeof(*info) + size;
+ if (sz < sizeof(*info))
+ return NULL;
+
/* Pedantry: prevent them from hitting BUG() in vmalloc.c --RR */
if ((SMP_ALIGN(size) >> PAGE_SHIFT) + 2 > totalram_pages)
return NULL;
@@ -693,12 +697,45 @@ EXPORT_SYMBOL(xt_free_table_info);
struct xt_table *xt_find_table_lock(struct net *net, u_int8_t af,
const char *name)
{
- struct xt_table *t;
+ struct xt_table *t, *found = NULL;
mutex_lock(&xt[af].mutex);
list_for_each_entry(t, &net->xt.tables[af], list)
if (strcmp(t->name, name) == 0 && try_module_get(t->me))
return t;
+
+ if (net == &init_net)
+ goto out;
+
+ /* Table doesn't exist in this netns, re-try init */
+ list_for_each_entry(t, &init_net.xt.tables[af], list) {
+ if (strcmp(t->name, name))
+ continue;
+ if (!try_module_get(t->me))
+ return NULL;
+
+ mutex_unlock(&xt[af].mutex);
+ if (t->table_init(net) != 0) {
+ module_put(t->me);
+ return NULL;
+ }
+
+ found = t;
+
+ mutex_lock(&xt[af].mutex);
+ break;
+ }
+
+ if (!found)
+ goto out;
+
+ /* and once again: */
+ list_for_each_entry(t, &net->xt.tables[af], list)
+ if (strcmp(t->name, name) == 0)
+ return t;
+
+ module_put(found->me);
+ out:
mutex_unlock(&xt[af].mutex);
return NULL;
}
@@ -1169,20 +1206,20 @@ static const struct file_operations xt_target_ops = {
#endif /* CONFIG_PROC_FS */
/**
- * xt_hook_link - set up hooks for a new table
+ * xt_hook_ops_alloc - set up hooks for a new table
* @table: table with metadata needed to set up hooks
* @fn: Hook function
*
- * This function will take care of creating and registering the necessary
- * Netfilter hooks for XT tables.
+ * This function will create the nf_hook_ops that the x_table needs
+ * to hand to xt_hook_link_net().
*/
-struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
+struct nf_hook_ops *
+xt_hook_ops_alloc(const struct xt_table *table, nf_hookfn *fn)
{
unsigned int hook_mask = table->valid_hooks;
uint8_t i, num_hooks = hweight32(hook_mask);
uint8_t hooknum;
struct nf_hook_ops *ops;
- int ret;
ops = kmalloc(sizeof(*ops) * num_hooks, GFP_KERNEL);
if (ops == NULL)
@@ -1199,33 +1236,17 @@ struct nf_hook_ops *xt_hook_link(const struct xt_table *table, nf_hookfn *fn)
++i;
}
- ret = nf_register_hooks(ops, num_hooks);
- if (ret < 0) {
- kfree(ops);
- return ERR_PTR(ret);
- }
-
return ops;
}
-EXPORT_SYMBOL_GPL(xt_hook_link);
-
-/**
- * xt_hook_unlink - remove hooks for a table
- * @ops: nf_hook_ops array as returned by nf_hook_link
- * @hook_mask: the very same mask that was passed to nf_hook_link
- */
-void xt_hook_unlink(const struct xt_table *table, struct nf_hook_ops *ops)
-{
- nf_unregister_hooks(ops, hweight32(table->valid_hooks));
- kfree(ops);
-}
-EXPORT_SYMBOL_GPL(xt_hook_unlink);
+EXPORT_SYMBOL_GPL(xt_hook_ops_alloc);
int xt_proto_init(struct net *net, u_int8_t af)
{
#ifdef CONFIG_PROC_FS
char buf[XT_FUNCTION_MAXNAMELEN];
struct proc_dir_entry *proc;
+ kuid_t root_uid;
+ kgid_t root_gid;
#endif
if (af >= ARRAY_SIZE(xt_prefix))
@@ -1233,12 +1254,17 @@ int xt_proto_init(struct net *net, u_int8_t af)
#ifdef CONFIG_PROC_FS
+ root_uid = make_kuid(net->user_ns, 0);
+ root_gid = make_kgid(net->user_ns, 0);
+
strlcpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TABLES, sizeof(buf));
proc = proc_create_data(buf, 0440, net->proc_net, &xt_table_ops,
(void *)(unsigned long)af);
if (!proc)
goto out;
+ if (uid_valid(root_uid) && gid_valid(root_gid))
+ proc_set_user(proc, root_uid, root_gid);
strlcpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_MATCHES, sizeof(buf));
@@ -1246,6 +1272,8 @@ int xt_proto_init(struct net *net, u_int8_t af)
(void *)(unsigned long)af);
if (!proc)
goto out_remove_tables;
+ if (uid_valid(root_uid) && gid_valid(root_gid))
+ proc_set_user(proc, root_uid, root_gid);
strlcpy(buf, xt_prefix[af], sizeof(buf));
strlcat(buf, FORMAT_TARGETS, sizeof(buf));
@@ -1253,6 +1281,8 @@ int xt_proto_init(struct net *net, u_int8_t af)
(void *)(unsigned long)af);
if (!proc)
goto out_remove_matches;
+ if (uid_valid(root_uid) && gid_valid(root_gid))
+ proc_set_user(proc, root_uid, root_gid);
#endif
return 0;
diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index e7ac07e53b59..6669e68d589e 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -143,7 +143,7 @@ xt_ct_set_timeout(struct nf_conn *ct, const struct xt_tgchk_param *par,
goto out;
}
- timeout = timeout_find_get(timeout_name);
+ timeout = timeout_find_get(par->net, timeout_name);
if (timeout == NULL) {
ret = -ENOENT;
pr_info("No such timeout policy \"%s\"\n", timeout_name);
diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index b7c43def0dc6..e118397254af 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -228,7 +228,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
u8 nexthdr;
- __be16 frag_off;
+ __be16 frag_off, oldlen, newlen;
int tcphoff;
int ret;
@@ -244,7 +244,12 @@ tcpmss_tg6(struct sk_buff *skb, const struct xt_action_param *par)
return NF_DROP;
if (ret > 0) {
ipv6h = ipv6_hdr(skb);
- ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret);
+ oldlen = ipv6h->payload_len;
+ newlen = htons(ntohs(oldlen) + ret);
+ if (skb->ip_summed == CHECKSUM_COMPLETE)
+ skb->csum = csum_add(csum_sub(skb->csum, oldlen),
+ newlen);
+ ipv6h->payload_len = newlen;
}
return XT_CONTINUE;
}
diff --git a/net/netfilter/xt_TEE.c b/net/netfilter/xt_TEE.c
index 899b06115fc5..6e57a3966dc5 100644
--- a/net/netfilter/xt_TEE.c
+++ b/net/netfilter/xt_TEE.c
@@ -31,19 +31,21 @@ static unsigned int
tee_tg4(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tee_tginfo *info = par->targinfo;
+ int oif = info->priv ? info->priv->oif : 0;
- nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, info->priv->oif);
+ nf_dup_ipv4(par->net, skb, par->hooknum, &info->gw.in, oif);
return XT_CONTINUE;
}
-#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
static unsigned int
tee_tg6(struct sk_buff *skb, const struct xt_action_param *par)
{
const struct xt_tee_tginfo *info = par->targinfo;
+ int oif = info->priv ? info->priv->oif : 0;
- nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, info->priv->oif);
+ nf_dup_ipv6(par->net, skb, par->hooknum, &info->gw.in6, oif);
return XT_CONTINUE;
}
@@ -129,7 +131,7 @@ static struct xt_target tee_tg_reg[] __read_mostly = {
.destroy = tee_tg_destroy,
.me = THIS_MODULE,
},
-#if IS_ENABLED(CONFIG_NF_DUP_IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
{
.name = "TEE",
.revision = 1,
diff --git a/net/netfilter/xt_TPROXY.c b/net/netfilter/xt_TPROXY.c
index 3ab591e73ec0..7f4414d26a66 100644
--- a/net/netfilter/xt_TPROXY.c
+++ b/net/netfilter/xt_TPROXY.c
@@ -105,19 +105,24 @@ tproxy_laddr4(struct sk_buff *skb, __be32 user_laddr, __be32 daddr)
* belonging to established connections going through that one.
*/
static inline struct sock *
-nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
+nf_tproxy_get_sock_v4(struct net *net, struct sk_buff *skb, void *hp,
+ const u8 protocol,
const __be32 saddr, const __be32 daddr,
const __be16 sport, const __be16 dport,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
struct sock *sk;
+ struct tcphdr *tcph;
switch (protocol) {
case IPPROTO_TCP:
switch (lookup_type) {
case NFT_LOOKUP_LISTENER:
- sk = inet_lookup_listener(net, &tcp_hashinfo,
+ tcph = hp;
+ sk = inet_lookup_listener(net, &tcp_hashinfo, skb,
+ ip_hdrlen(skb) +
+ __tcp_hdrlen(tcph),
saddr, sport,
daddr, dport,
in->ifindex);
@@ -169,19 +174,23 @@ nf_tproxy_get_sock_v4(struct net *net, const u8 protocol,
#ifdef XT_TPROXY_HAVE_IPV6
static inline struct sock *
-nf_tproxy_get_sock_v6(struct net *net, const u8 protocol,
+nf_tproxy_get_sock_v6(struct net *net, struct sk_buff *skb, int thoff, void *hp,
+ const u8 protocol,
const struct in6_addr *saddr, const struct in6_addr *daddr,
const __be16 sport, const __be16 dport,
const struct net_device *in,
const enum nf_tproxy_lookup_t lookup_type)
{
struct sock *sk;
+ struct tcphdr *tcph;
switch (protocol) {
case IPPROTO_TCP:
switch (lookup_type) {
case NFT_LOOKUP_LISTENER:
- sk = inet6_lookup_listener(net, &tcp_hashinfo,
+ tcph = hp;
+ sk = inet6_lookup_listener(net, &tcp_hashinfo, skb,
+ thoff + __tcp_hdrlen(tcph),
saddr, sport,
daddr, ntohs(dport),
in->ifindex);
@@ -267,7 +276,7 @@ tproxy_handle_time_wait4(struct net *net, struct sk_buff *skb,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v4(net, iph->protocol,
+ sk2 = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
iph->saddr, laddr ? laddr : iph->daddr,
hp->source, lport ? lport : hp->dest,
skb->dev, NFT_LOOKUP_LISTENER);
@@ -305,7 +314,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v4(net, iph->protocol,
+ sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
iph->saddr, iph->daddr,
hp->source, hp->dest,
skb->dev, NFT_LOOKUP_ESTABLISHED);
@@ -321,7 +330,7 @@ tproxy_tg4(struct net *net, struct sk_buff *skb, __be32 laddr, __be16 lport,
else if (!sk)
/* no, there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v4(net, iph->protocol,
+ sk = nf_tproxy_get_sock_v4(net, skb, hp, iph->protocol,
iph->saddr, laddr,
hp->source, lport,
skb->dev, NFT_LOOKUP_LISTENER);
@@ -429,7 +438,7 @@ tproxy_handle_time_wait6(struct sk_buff *skb, int tproto, int thoff,
* to a listener socket if there's one */
struct sock *sk2;
- sk2 = nf_tproxy_get_sock_v6(par->net, tproto,
+ sk2 = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
&iph->saddr,
tproxy_laddr6(skb, &tgi->laddr.in6, &iph->daddr),
hp->source,
@@ -472,7 +481,7 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
* addresses, this happens if the redirect already happened
* and the current packet belongs to an already established
* connection */
- sk = nf_tproxy_get_sock_v6(par->net, tproto,
+ sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp, tproto,
&iph->saddr, &iph->daddr,
hp->source, hp->dest,
par->in, NFT_LOOKUP_ESTABLISHED);
@@ -487,8 +496,8 @@ tproxy_tg6_v1(struct sk_buff *skb, const struct xt_action_param *par)
else if (!sk)
/* no there's no established connection, check if
* there's a listener on the redirected addr/port */
- sk = nf_tproxy_get_sock_v6(par->net, tproto,
- &iph->saddr, laddr,
+ sk = nf_tproxy_get_sock_v6(par->net, skb, thoff, hp,
+ tproto, &iph->saddr, laddr,
hp->source, lport,
par->in, NFT_LOOKUP_LISTENER);
diff --git a/net/netfilter/xt_cgroup.c b/net/netfilter/xt_cgroup.c
index a1d126f29463..a086a914865f 100644
--- a/net/netfilter/xt_cgroup.c
+++ b/net/netfilter/xt_cgroup.c
@@ -24,9 +24,9 @@ MODULE_DESCRIPTION("Xtables: process control group matching");
MODULE_ALIAS("ipt_cgroup");
MODULE_ALIAS("ip6t_cgroup");
-static int cgroup_mt_check(const struct xt_mtchk_param *par)
+static int cgroup_mt_check_v0(const struct xt_mtchk_param *par)
{
- struct xt_cgroup_info *info = par->matchinfo;
+ struct xt_cgroup_info_v0 *info = par->matchinfo;
if (info->invert & ~1)
return -EINVAL;
@@ -34,38 +34,110 @@ static int cgroup_mt_check(const struct xt_mtchk_param *par)
return 0;
}
+static int cgroup_mt_check_v1(const struct xt_mtchk_param *par)
+{
+ struct xt_cgroup_info_v1 *info = par->matchinfo;
+ struct cgroup *cgrp;
+
+ if ((info->invert_path & ~1) || (info->invert_classid & ~1))
+ return -EINVAL;
+
+ if (!info->has_path && !info->has_classid) {
+ pr_info("xt_cgroup: no path or classid specified\n");
+ return -EINVAL;
+ }
+
+ if (info->has_path && info->has_classid) {
+ pr_info("xt_cgroup: both path and classid specified\n");
+ return -EINVAL;
+ }
+
+ if (info->has_path) {
+ cgrp = cgroup_get_from_path(info->path);
+ if (IS_ERR(cgrp)) {
+ pr_info("xt_cgroup: invalid path, errno=%ld\n",
+ PTR_ERR(cgrp));
+ return -EINVAL;
+ }
+ info->priv = cgrp;
+ }
+
+ return 0;
+}
+
static bool
-cgroup_mt(const struct sk_buff *skb, struct xt_action_param *par)
+cgroup_mt_v0(const struct sk_buff *skb, struct xt_action_param *par)
{
- const struct xt_cgroup_info *info = par->matchinfo;
+ const struct xt_cgroup_info_v0 *info = par->matchinfo;
if (skb->sk == NULL || !sk_fullsock(skb->sk))
return false;
- return (info->id == skb->sk->sk_classid) ^ info->invert;
+ return (info->id == sock_cgroup_classid(&skb->sk->sk_cgrp_data)) ^
+ info->invert;
+}
+
+static bool cgroup_mt_v1(const struct sk_buff *skb, struct xt_action_param *par)
+{
+ const struct xt_cgroup_info_v1 *info = par->matchinfo;
+ struct sock_cgroup_data *skcd = &skb->sk->sk_cgrp_data;
+ struct cgroup *ancestor = info->priv;
+
+ if (!skb->sk || !sk_fullsock(skb->sk))
+ return false;
+
+ if (ancestor)
+ return cgroup_is_descendant(sock_cgroup_ptr(skcd), ancestor) ^
+ info->invert_path;
+ else
+ return (info->classid == sock_cgroup_classid(skcd)) ^
+ info->invert_classid;
+}
+
+static void cgroup_mt_destroy_v1(const struct xt_mtdtor_param *par)
+{
+ struct xt_cgroup_info_v1 *info = par->matchinfo;
+
+ if (info->priv)
+ cgroup_put(info->priv);
}
-static struct xt_match cgroup_mt_reg __read_mostly = {
- .name = "cgroup",
- .revision = 0,
- .family = NFPROTO_UNSPEC,
- .checkentry = cgroup_mt_check,
- .match = cgroup_mt,
- .matchsize = sizeof(struct xt_cgroup_info),
- .me = THIS_MODULE,
- .hooks = (1 << NF_INET_LOCAL_OUT) |
- (1 << NF_INET_POST_ROUTING) |
- (1 << NF_INET_LOCAL_IN),
+static struct xt_match cgroup_mt_reg[] __read_mostly = {
+ {
+ .name = "cgroup",
+ .revision = 0,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = cgroup_mt_check_v0,
+ .match = cgroup_mt_v0,
+ .matchsize = sizeof(struct xt_cgroup_info_v0),
+ .me = THIS_MODULE,
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ },
+ {
+ .name = "cgroup",
+ .revision = 1,
+ .family = NFPROTO_UNSPEC,
+ .checkentry = cgroup_mt_check_v1,
+ .match = cgroup_mt_v1,
+ .matchsize = sizeof(struct xt_cgroup_info_v1),
+ .destroy = cgroup_mt_destroy_v1,
+ .me = THIS_MODULE,
+ .hooks = (1 << NF_INET_LOCAL_OUT) |
+ (1 << NF_INET_POST_ROUTING) |
+ (1 << NF_INET_LOCAL_IN),
+ },
};
static int __init cgroup_mt_init(void)
{
- return xt_register_match(&cgroup_mt_reg);
+ return xt_register_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg));
}
static void __exit cgroup_mt_exit(void)
{
- xt_unregister_match(&cgroup_mt_reg);
+ xt_unregister_matches(cgroup_mt_reg, ARRAY_SIZE(cgroup_mt_reg));
}
module_init(cgroup_mt_init);
diff --git a/net/netfilter/xt_osf.c b/net/netfilter/xt_osf.c
index df8801e02a32..2455b69b5810 100644
--- a/net/netfilter/xt_osf.c
+++ b/net/netfilter/xt_osf.c
@@ -61,8 +61,8 @@ static const struct nla_policy xt_osf_policy[OSF_ATTR_MAX + 1] = {
[OSF_ATTR_FINGER] = { .len = sizeof(struct xt_osf_user_finger) },
};
-static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb,
- const struct nlmsghdr *nlh,
+static int xt_osf_add_callback(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb, const struct nlmsghdr *nlh,
const struct nlattr * const osf_attrs[])
{
struct xt_osf_user_finger *f;
@@ -104,7 +104,8 @@ static int xt_osf_add_callback(struct sock *ctnl, struct sk_buff *skb,
return err;
}
-static int xt_osf_remove_callback(struct sock *ctnl, struct sk_buff *skb,
+static int xt_osf_remove_callback(struct net *net, struct sock *ctnl,
+ struct sk_buff *skb,
const struct nlmsghdr *nlh,
const struct nlattr * const osf_attrs[])
{
@@ -261,7 +262,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
if (f->opt[optnum].kind == (*optp)) {
__u32 len = f->opt[optnum].length;
const __u8 *optend = optp + len;
- int loop_cont = 0;
fmatch = FMATCH_OK;
@@ -274,7 +274,6 @@ xt_osf_match_packet(const struct sk_buff *skb, struct xt_action_param *p)
mss = ntohs((__force __be16)mss);
break;
case OSFOPT_TS:
- loop_cont = 1;
break;
}
diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index ca2e577ed8ac..1302b475abcb 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -14,6 +14,7 @@
#include <linux/skbuff.h>
#include <linux/file.h>
#include <net/sock.h>
+#include <net/inet_sock.h>
#include <linux/netfilter/x_tables.h>
#include <linux/netfilter/xt_owner.h>
@@ -33,8 +34,9 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
{
const struct xt_owner_match_info *info = par->matchinfo;
const struct file *filp;
+ struct sock *sk = skb_to_full_sk(skb);
- if (skb->sk == NULL || skb->sk->sk_socket == NULL)
+ if (sk == NULL || sk->sk_socket == NULL)
return (info->match ^ info->invert) == 0;
else if (info->match & info->invert & XT_OWNER_SOCKET)
/*
@@ -43,7 +45,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
*/
return false;
- filp = skb->sk->sk_socket->file;
+ filp = sk->sk_socket->file;
if (filp == NULL)
return ((info->match ^ info->invert) &
(XT_OWNER_UID | XT_OWNER_GID)) == 0;
diff --git a/net/netfilter/xt_socket.c b/net/netfilter/xt_socket.c
index 2ec08f04b816..49d14ecad444 100644
--- a/net/netfilter/xt_socket.c
+++ b/net/netfilter/xt_socket.c
@@ -112,14 +112,15 @@ extract_icmp4_fields(const struct sk_buff *skb,
* box.
*/
static struct sock *
-xt_socket_get_sock_v4(struct net *net, const u8 protocol,
+xt_socket_get_sock_v4(struct net *net, struct sk_buff *skb, const int doff,
+ const u8 protocol,
const __be32 saddr, const __be32 daddr,
const __be16 sport, const __be16 dport,
const struct net_device *in)
{
switch (protocol) {
case IPPROTO_TCP:
- return __inet_lookup(net, &tcp_hashinfo,
+ return __inet_lookup(net, &tcp_hashinfo, skb, doff,
saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
@@ -148,6 +149,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
const struct net_device *indev)
{
const struct iphdr *iph = ip_hdr(skb);
+ struct sk_buff *data_skb = NULL;
+ int doff = 0;
__be32 uninitialized_var(daddr), uninitialized_var(saddr);
__be16 uninitialized_var(dport), uninitialized_var(sport);
u8 uninitialized_var(protocol);
@@ -169,6 +172,10 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
sport = hp->source;
daddr = iph->daddr;
dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = iph->protocol == IPPROTO_TCP ?
+ ip_hdrlen(skb) + __tcp_hdrlen((struct tcphdr *)hp) :
+ ip_hdrlen(skb) + sizeof(*hp);
} else if (iph->protocol == IPPROTO_ICMP) {
if (extract_icmp4_fields(skb, &protocol, &saddr, &daddr,
@@ -198,8 +205,8 @@ static struct sock *xt_socket_lookup_slow_v4(struct net *net,
}
#endif
- return xt_socket_get_sock_v4(net, protocol, saddr, daddr,
- sport, dport, indev);
+ return xt_socket_get_sock_v4(net, data_skb, doff, protocol, saddr,
+ daddr, sport, dport, indev);
}
static bool
@@ -318,14 +325,15 @@ extract_icmp6_fields(const struct sk_buff *skb,
}
static struct sock *
-xt_socket_get_sock_v6(struct net *net, const u8 protocol,
+xt_socket_get_sock_v6(struct net *net, struct sk_buff *skb, int doff,
+ const u8 protocol,
const struct in6_addr *saddr, const struct in6_addr *daddr,
const __be16 sport, const __be16 dport,
const struct net_device *in)
{
switch (protocol) {
case IPPROTO_TCP:
- return inet6_lookup(net, &tcp_hashinfo,
+ return inet6_lookup(net, &tcp_hashinfo, skb, doff,
saddr, sport, daddr, dport,
in->ifindex);
case IPPROTO_UDP:
@@ -343,6 +351,8 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
__be16 uninitialized_var(dport), uninitialized_var(sport);
const struct in6_addr *daddr = NULL, *saddr = NULL;
struct ipv6hdr *iph = ipv6_hdr(skb);
+ struct sk_buff *data_skb = NULL;
+ int doff = 0;
int thoff = 0, tproto;
tproto = ipv6_find_hdr(skb, &thoff, -1, NULL, NULL);
@@ -362,6 +372,10 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
sport = hp->source;
daddr = &iph->daddr;
dport = hp->dest;
+ data_skb = (struct sk_buff *)skb;
+ doff = tproto == IPPROTO_TCP ?
+ thoff + __tcp_hdrlen((struct tcphdr *)hp) :
+ thoff + sizeof(*hp);
} else if (tproto == IPPROTO_ICMPV6) {
struct ipv6hdr ipv6_var;
@@ -373,7 +387,7 @@ static struct sock *xt_socket_lookup_slow_v6(struct net *net,
return NULL;
}
- return xt_socket_get_sock_v6(net, tproto, saddr, daddr,
+ return xt_socket_get_sock_v6(net, data_skb, doff, tproto, saddr, daddr,
sport, dport, indev);
}
diff --git a/net/netlabel/netlabel_domainhash.c b/net/netlabel/netlabel_domainhash.c
index f0cb92f3ddaf..ada67422234b 100644
--- a/net/netlabel/netlabel_domainhash.c
+++ b/net/netlabel/netlabel_domainhash.c
@@ -55,8 +55,8 @@ struct netlbl_domhsh_tbl {
static DEFINE_SPINLOCK(netlbl_domhsh_lock);
#define netlbl_domhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_domhsh_lock))
-static struct netlbl_domhsh_tbl *netlbl_domhsh = NULL;
-static struct netlbl_dom_map *netlbl_domhsh_def = NULL;
+static struct netlbl_domhsh_tbl *netlbl_domhsh;
+static struct netlbl_dom_map *netlbl_domhsh_def;
/*
* Domain Hash Table Helper Functions
diff --git a/net/netlabel/netlabel_unlabeled.c b/net/netlabel/netlabel_unlabeled.c
index b0380927f05f..9eaa9a1e8629 100644
--- a/net/netlabel/netlabel_unlabeled.c
+++ b/net/netlabel/netlabel_unlabeled.c
@@ -116,11 +116,11 @@ struct netlbl_unlhsh_walk_arg {
static DEFINE_SPINLOCK(netlbl_unlhsh_lock);
#define netlbl_unlhsh_rcu_deref(p) \
rcu_dereference_check(p, lockdep_is_held(&netlbl_unlhsh_lock))
-static struct netlbl_unlhsh_tbl *netlbl_unlhsh = NULL;
-static struct netlbl_unlhsh_iface *netlbl_unlhsh_def = NULL;
+static struct netlbl_unlhsh_tbl *netlbl_unlhsh;
+static struct netlbl_unlhsh_iface *netlbl_unlhsh_def;
/* Accept unlabeled packets flag */
-static u8 netlabel_unlabel_acceptflg = 0;
+static u8 netlabel_unlabel_acceptflg;
/* NetLabel Generic NETLINK unlabeled family */
static struct genl_family netlbl_unlabel_gnl_family = {
diff --git a/net/netlink/Kconfig b/net/netlink/Kconfig
index 2c5e95e9bfbd..5d6e8c05b3d4 100644
--- a/net/netlink/Kconfig
+++ b/net/netlink/Kconfig
@@ -2,15 +2,6 @@
# Netlink Sockets
#
-config NETLINK_MMAP
- bool "NETLINK: mmaped IO"
- ---help---
- This option enables support for memory mapped netlink IO. This
- reduces overhead by avoiding copying data between kernel- and
- userspace.
-
- If unsure, say N.
-
config NETLINK_DIAG
tristate "NETLINK: socket monitoring interface"
default n
diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index fafe33bdb619..215fc08c02ab 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -225,7 +225,7 @@ static int __netlink_deliver_tap_skb(struct sk_buff *skb,
dev_hold(dev);
- if (netlink_skb_is_mmaped(skb) || is_vmalloc_addr(skb->head))
+ if (is_vmalloc_addr(skb->head))
nskb = netlink_to_full_skb(skb, GFP_ATOMIC);
else
nskb = skb_clone(skb, GFP_ATOMIC);
@@ -300,610 +300,8 @@ static void netlink_rcv_wake(struct sock *sk)
wake_up_interruptible(&nlk->wait);
}
-#ifdef CONFIG_NETLINK_MMAP
-static bool netlink_rx_is_mmaped(struct sock *sk)
-{
- return nlk_sk(sk)->rx_ring.pg_vec != NULL;
-}
-
-static bool netlink_tx_is_mmaped(struct sock *sk)
-{
- return nlk_sk(sk)->tx_ring.pg_vec != NULL;
-}
-
-static __pure struct page *pgvec_to_page(const void *addr)
-{
- if (is_vmalloc_addr(addr))
- return vmalloc_to_page(addr);
- else
- return virt_to_page(addr);
-}
-
-static void free_pg_vec(void **pg_vec, unsigned int order, unsigned int len)
-{
- unsigned int i;
-
- for (i = 0; i < len; i++) {
- if (pg_vec[i] != NULL) {
- if (is_vmalloc_addr(pg_vec[i]))
- vfree(pg_vec[i]);
- else
- free_pages((unsigned long)pg_vec[i], order);
- }
- }
- kfree(pg_vec);
-}
-
-static void *alloc_one_pg_vec_page(unsigned long order)
-{
- void *buffer;
- gfp_t gfp_flags = GFP_KERNEL | __GFP_COMP | __GFP_ZERO |
- __GFP_NOWARN | __GFP_NORETRY;
-
- buffer = (void *)__get_free_pages(gfp_flags, order);
- if (buffer != NULL)
- return buffer;
-
- buffer = vzalloc((1 << order) * PAGE_SIZE);
- if (buffer != NULL)
- return buffer;
-
- gfp_flags &= ~__GFP_NORETRY;
- return (void *)__get_free_pages(gfp_flags, order);
-}
-
-static void **alloc_pg_vec(struct netlink_sock *nlk,
- struct nl_mmap_req *req, unsigned int order)
-{
- unsigned int block_nr = req->nm_block_nr;
- unsigned int i;
- void **pg_vec;
-
- pg_vec = kcalloc(block_nr, sizeof(void *), GFP_KERNEL);
- if (pg_vec == NULL)
- return NULL;
-
- for (i = 0; i < block_nr; i++) {
- pg_vec[i] = alloc_one_pg_vec_page(order);
- if (pg_vec[i] == NULL)
- goto err1;
- }
-
- return pg_vec;
-err1:
- free_pg_vec(pg_vec, order, block_nr);
- return NULL;
-}
-
-
-static void
-__netlink_set_ring(struct sock *sk, struct nl_mmap_req *req, bool tx_ring, void **pg_vec,
- unsigned int order)
-{
- struct netlink_sock *nlk = nlk_sk(sk);
- struct sk_buff_head *queue;
- struct netlink_ring *ring;
-
- queue = tx_ring ? &sk->sk_write_queue : &sk->sk_receive_queue;
- ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
- spin_lock_bh(&queue->lock);
-
- ring->frame_max = req->nm_frame_nr - 1;
- ring->head = 0;
- ring->frame_size = req->nm_frame_size;
- ring->pg_vec_pages = req->nm_block_size / PAGE_SIZE;
-
- swap(ring->pg_vec_len, req->nm_block_nr);
- swap(ring->pg_vec_order, order);
- swap(ring->pg_vec, pg_vec);
-
- __skb_queue_purge(queue);
- spin_unlock_bh(&queue->lock);
-
- WARN_ON(atomic_read(&nlk->mapped));
-
- if (pg_vec)
- free_pg_vec(pg_vec, order, req->nm_block_nr);
-}
-
-static int netlink_set_ring(struct sock *sk, struct nl_mmap_req *req,
- bool tx_ring)
-{
- struct netlink_sock *nlk = nlk_sk(sk);
- struct netlink_ring *ring;
- void **pg_vec = NULL;
- unsigned int order = 0;
-
- ring = tx_ring ? &nlk->tx_ring : &nlk->rx_ring;
-
- if (atomic_read(&nlk->mapped))
- return -EBUSY;
- if (atomic_read(&ring->pending))
- return -EBUSY;
-
- if (req->nm_block_nr) {
- if (ring->pg_vec != NULL)
- return -EBUSY;
-
- if ((int)req->nm_block_size <= 0)
- return -EINVAL;
- if (!PAGE_ALIGNED(req->nm_block_size))
- return -EINVAL;
- if (req->nm_frame_size < NL_MMAP_HDRLEN)
- return -EINVAL;
- if (!IS_ALIGNED(req->nm_frame_size, NL_MMAP_MSG_ALIGNMENT))
- return -EINVAL;
-
- ring->frames_per_block = req->nm_block_size /
- req->nm_frame_size;
- if (ring->frames_per_block == 0)
- return -EINVAL;
- if (ring->frames_per_block * req->nm_block_nr !=
- req->nm_frame_nr)
- return -EINVAL;
-
- order = get_order(req->nm_block_size);
- pg_vec = alloc_pg_vec(nlk, req, order);
- if (pg_vec == NULL)
- return -ENOMEM;
- } else {
- if (req->nm_frame_nr)
- return -EINVAL;
- }
-
- mutex_lock(&nlk->pg_vec_lock);
- if (atomic_read(&nlk->mapped) == 0) {
- __netlink_set_ring(sk, req, tx_ring, pg_vec, order);
- mutex_unlock(&nlk->pg_vec_lock);
- return 0;
- }
-
- mutex_unlock(&nlk->pg_vec_lock);
-
- if (pg_vec)
- free_pg_vec(pg_vec, order, req->nm_block_nr);
-
- return -EBUSY;
-}
-
-static void netlink_mm_open(struct vm_area_struct *vma)
-{
- struct file *file = vma->vm_file;
- struct socket *sock = file->private_data;
- struct sock *sk = sock->sk;
-
- if (sk)
- atomic_inc(&nlk_sk(sk)->mapped);
-}
-
-static void netlink_mm_close(struct vm_area_struct *vma)
-{
- struct file *file = vma->vm_file;
- struct socket *sock = file->private_data;
- struct sock *sk = sock->sk;
-
- if (sk)
- atomic_dec(&nlk_sk(sk)->mapped);
-}
-
-static const struct vm_operations_struct netlink_mmap_ops = {
- .open = netlink_mm_open,
- .close = netlink_mm_close,
-};
-
-static int netlink_mmap(struct file *file, struct socket *sock,
- struct vm_area_struct *vma)
-{
- struct sock *sk = sock->sk;
- struct netlink_sock *nlk = nlk_sk(sk);
- struct netlink_ring *ring;
- unsigned long start, size, expected;
- unsigned int i;
- int err = -EINVAL;
-
- if (vma->vm_pgoff)
- return -EINVAL;
-
- mutex_lock(&nlk->pg_vec_lock);
-
- expected = 0;
- for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
- if (ring->pg_vec == NULL)
- continue;
- expected += ring->pg_vec_len * ring->pg_vec_pages * PAGE_SIZE;
- }
-
- if (expected == 0)
- goto out;
-
- size = vma->vm_end - vma->vm_start;
- if (size != expected)
- goto out;
-
- start = vma->vm_start;
- for (ring = &nlk->rx_ring; ring <= &nlk->tx_ring; ring++) {
- if (ring->pg_vec == NULL)
- continue;
-
- for (i = 0; i < ring->pg_vec_len; i++) {
- struct page *page;
- void *kaddr = ring->pg_vec[i];
- unsigned int pg_num;
-
- for (pg_num = 0; pg_num < ring->pg_vec_pages; pg_num++) {
- page = pgvec_to_page(kaddr);
- err = vm_insert_page(vma, start, page);
- if (err < 0)
- goto out;
- start += PAGE_SIZE;
- kaddr += PAGE_SIZE;
- }
- }
- }
-
- atomic_inc(&nlk->mapped);
- vma->vm_ops = &netlink_mmap_ops;
- err = 0;
-out:
- mutex_unlock(&nlk->pg_vec_lock);
- return err;
-}
-
-static void netlink_frame_flush_dcache(const struct nl_mmap_hdr *hdr, unsigned int nm_len)
-{
-#if ARCH_IMPLEMENTS_FLUSH_DCACHE_PAGE == 1
- struct page *p_start, *p_end;
-
- /* First page is flushed through netlink_{get,set}_status */
- p_start = pgvec_to_page(hdr + PAGE_SIZE);
- p_end = pgvec_to_page((void *)hdr + NL_MMAP_HDRLEN + nm_len - 1);
- while (p_start <= p_end) {
- flush_dcache_page(p_start);
- p_start++;
- }
-#endif
-}
-
-static enum nl_mmap_status netlink_get_status(const struct nl_mmap_hdr *hdr)
-{
- smp_rmb();
- flush_dcache_page(pgvec_to_page(hdr));
- return hdr->nm_status;
-}
-
-static void netlink_set_status(struct nl_mmap_hdr *hdr,
- enum nl_mmap_status status)
-{
- smp_mb();
- hdr->nm_status = status;
- flush_dcache_page(pgvec_to_page(hdr));
-}
-
-static struct nl_mmap_hdr *
-__netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos)
-{
- unsigned int pg_vec_pos, frame_off;
-
- pg_vec_pos = pos / ring->frames_per_block;
- frame_off = pos % ring->frames_per_block;
-
- return ring->pg_vec[pg_vec_pos] + (frame_off * ring->frame_size);
-}
-
-static struct nl_mmap_hdr *
-netlink_lookup_frame(const struct netlink_ring *ring, unsigned int pos,
- enum nl_mmap_status status)
-{
- struct nl_mmap_hdr *hdr;
-
- hdr = __netlink_lookup_frame(ring, pos);
- if (netlink_get_status(hdr) != status)
- return NULL;
-
- return hdr;
-}
-
-static struct nl_mmap_hdr *
-netlink_current_frame(const struct netlink_ring *ring,
- enum nl_mmap_status status)
-{
- return netlink_lookup_frame(ring, ring->head, status);
-}
-
-static void netlink_increment_head(struct netlink_ring *ring)
-{
- ring->head = ring->head != ring->frame_max ? ring->head + 1 : 0;
-}
-
-static void netlink_forward_ring(struct netlink_ring *ring)
-{
- unsigned int head = ring->head;
- const struct nl_mmap_hdr *hdr;
-
- do {
- hdr = __netlink_lookup_frame(ring, ring->head);
- if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
- break;
- if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
- break;
- netlink_increment_head(ring);
- } while (ring->head != head);
-}
-
-static bool netlink_has_valid_frame(struct netlink_ring *ring)
-{
- unsigned int head = ring->head, pos = head;
- const struct nl_mmap_hdr *hdr;
-
- do {
- hdr = __netlink_lookup_frame(ring, pos);
- if (hdr->nm_status == NL_MMAP_STATUS_VALID)
- return true;
- pos = pos != 0 ? pos - 1 : ring->frame_max;
- } while (pos != head);
-
- return false;
-}
-
-static bool netlink_dump_space(struct netlink_sock *nlk)
-{
- struct netlink_ring *ring = &nlk->rx_ring;
- struct nl_mmap_hdr *hdr;
- unsigned int n;
-
- hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
- if (hdr == NULL)
- return false;
-
- n = ring->head + ring->frame_max / 2;
- if (n > ring->frame_max)
- n -= ring->frame_max;
-
- hdr = __netlink_lookup_frame(ring, n);
-
- return hdr->nm_status == NL_MMAP_STATUS_UNUSED;
-}
-
-static unsigned int netlink_poll(struct file *file, struct socket *sock,
- poll_table *wait)
-{
- struct sock *sk = sock->sk;
- struct netlink_sock *nlk = nlk_sk(sk);
- unsigned int mask;
- int err;
-
- if (nlk->rx_ring.pg_vec != NULL) {
- /* Memory mapped sockets don't call recvmsg(), so flow control
- * for dumps is performed here. A dump is allowed to continue
- * if at least half the ring is unused.
- */
- while (nlk->cb_running && netlink_dump_space(nlk)) {
- err = netlink_dump(sk);
- if (err < 0) {
- sk->sk_err = -err;
- sk->sk_error_report(sk);
- break;
- }
- }
- netlink_rcv_wake(sk);
- }
-
- mask = datagram_poll(file, sock, wait);
-
- /* We could already have received frames in the normal receive
- * queue, that will show up as NL_MMAP_STATUS_COPY in the ring,
- * so if mask contains pollin/etc already, there's no point
- * walking the ring.
- */
- if ((mask & (POLLIN | POLLRDNORM)) != (POLLIN | POLLRDNORM)) {
- spin_lock_bh(&sk->sk_receive_queue.lock);
- if (nlk->rx_ring.pg_vec) {
- if (netlink_has_valid_frame(&nlk->rx_ring))
- mask |= POLLIN | POLLRDNORM;
- }
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- }
-
- spin_lock_bh(&sk->sk_write_queue.lock);
- if (nlk->tx_ring.pg_vec) {
- if (netlink_current_frame(&nlk->tx_ring, NL_MMAP_STATUS_UNUSED))
- mask |= POLLOUT | POLLWRNORM;
- }
- spin_unlock_bh(&sk->sk_write_queue.lock);
-
- return mask;
-}
-
-static struct nl_mmap_hdr *netlink_mmap_hdr(struct sk_buff *skb)
-{
- return (struct nl_mmap_hdr *)(skb->head - NL_MMAP_HDRLEN);
-}
-
-static void netlink_ring_setup_skb(struct sk_buff *skb, struct sock *sk,
- struct netlink_ring *ring,
- struct nl_mmap_hdr *hdr)
-{
- unsigned int size;
- void *data;
-
- size = ring->frame_size - NL_MMAP_HDRLEN;
- data = (void *)hdr + NL_MMAP_HDRLEN;
-
- skb->head = data;
- skb->data = data;
- skb_reset_tail_pointer(skb);
- skb->end = skb->tail + size;
- skb->len = 0;
-
- skb->destructor = netlink_skb_destructor;
- NETLINK_CB(skb).flags |= NETLINK_SKB_MMAPED;
- NETLINK_CB(skb).sk = sk;
-}
-
-static int netlink_mmap_sendmsg(struct sock *sk, struct msghdr *msg,
- u32 dst_portid, u32 dst_group,
- struct scm_cookie *scm)
-{
- struct netlink_sock *nlk = nlk_sk(sk);
- struct netlink_ring *ring;
- struct nl_mmap_hdr *hdr;
- struct sk_buff *skb;
- unsigned int maxlen;
- int err = 0, len = 0;
-
- mutex_lock(&nlk->pg_vec_lock);
-
- ring = &nlk->tx_ring;
- maxlen = ring->frame_size - NL_MMAP_HDRLEN;
-
- do {
- unsigned int nm_len;
-
- hdr = netlink_current_frame(ring, NL_MMAP_STATUS_VALID);
- if (hdr == NULL) {
- if (!(msg->msg_flags & MSG_DONTWAIT) &&
- atomic_read(&nlk->tx_ring.pending))
- schedule();
- continue;
- }
-
- nm_len = ACCESS_ONCE(hdr->nm_len);
- if (nm_len > maxlen) {
- err = -EINVAL;
- goto out;
- }
-
- netlink_frame_flush_dcache(hdr, nm_len);
-
- skb = alloc_skb(nm_len, GFP_KERNEL);
- if (skb == NULL) {
- err = -ENOBUFS;
- goto out;
- }
- __skb_put(skb, nm_len);
- memcpy(skb->data, (void *)hdr + NL_MMAP_HDRLEN, nm_len);
- netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
-
- netlink_increment_head(ring);
-
- NETLINK_CB(skb).portid = nlk->portid;
- NETLINK_CB(skb).dst_group = dst_group;
- NETLINK_CB(skb).creds = scm->creds;
-
- err = security_netlink_send(sk, skb);
- if (err) {
- kfree_skb(skb);
- goto out;
- }
-
- if (unlikely(dst_group)) {
- atomic_inc(&skb->users);
- netlink_broadcast(sk, skb, dst_portid, dst_group,
- GFP_KERNEL);
- }
- err = netlink_unicast(sk, skb, dst_portid,
- msg->msg_flags & MSG_DONTWAIT);
- if (err < 0)
- goto out;
- len += err;
-
- } while (hdr != NULL ||
- (!(msg->msg_flags & MSG_DONTWAIT) &&
- atomic_read(&nlk->tx_ring.pending)));
-
- if (len > 0)
- err = len;
-out:
- mutex_unlock(&nlk->pg_vec_lock);
- return err;
-}
-
-static void netlink_queue_mmaped_skb(struct sock *sk, struct sk_buff *skb)
-{
- struct nl_mmap_hdr *hdr;
-
- hdr = netlink_mmap_hdr(skb);
- hdr->nm_len = skb->len;
- hdr->nm_group = NETLINK_CB(skb).dst_group;
- hdr->nm_pid = NETLINK_CB(skb).creds.pid;
- hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
- hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
- netlink_frame_flush_dcache(hdr, hdr->nm_len);
- netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
-
- NETLINK_CB(skb).flags |= NETLINK_SKB_DELIVERED;
- kfree_skb(skb);
-}
-
-static void netlink_ring_set_copied(struct sock *sk, struct sk_buff *skb)
-{
- struct netlink_sock *nlk = nlk_sk(sk);
- struct netlink_ring *ring = &nlk->rx_ring;
- struct nl_mmap_hdr *hdr;
-
- spin_lock_bh(&sk->sk_receive_queue.lock);
- hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
- if (hdr == NULL) {
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- kfree_skb(skb);
- netlink_overrun(sk);
- return;
- }
- netlink_increment_head(ring);
- __skb_queue_tail(&sk->sk_receive_queue, skb);
- spin_unlock_bh(&sk->sk_receive_queue.lock);
-
- hdr->nm_len = skb->len;
- hdr->nm_group = NETLINK_CB(skb).dst_group;
- hdr->nm_pid = NETLINK_CB(skb).creds.pid;
- hdr->nm_uid = from_kuid(sk_user_ns(sk), NETLINK_CB(skb).creds.uid);
- hdr->nm_gid = from_kgid(sk_user_ns(sk), NETLINK_CB(skb).creds.gid);
- netlink_set_status(hdr, NL_MMAP_STATUS_COPY);
-}
-
-#else /* CONFIG_NETLINK_MMAP */
-#define netlink_rx_is_mmaped(sk) false
-#define netlink_tx_is_mmaped(sk) false
-#define netlink_mmap sock_no_mmap
-#define netlink_poll datagram_poll
-#define netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group, scm) 0
-#endif /* CONFIG_NETLINK_MMAP */
-
static void netlink_skb_destructor(struct sk_buff *skb)
{
-#ifdef CONFIG_NETLINK_MMAP
- struct nl_mmap_hdr *hdr;
- struct netlink_ring *ring;
- struct sock *sk;
-
- /* If a packet from the kernel to userspace was freed because of an
- * error without being delivered to userspace, the kernel must reset
- * the status. In the direction userspace to kernel, the status is
- * always reset here after the packet was processed and freed.
- */
- if (netlink_skb_is_mmaped(skb)) {
- hdr = netlink_mmap_hdr(skb);
- sk = NETLINK_CB(skb).sk;
-
- if (NETLINK_CB(skb).flags & NETLINK_SKB_TX) {
- netlink_set_status(hdr, NL_MMAP_STATUS_UNUSED);
- ring = &nlk_sk(sk)->tx_ring;
- } else {
- if (!(NETLINK_CB(skb).flags & NETLINK_SKB_DELIVERED)) {
- hdr->nm_len = 0;
- netlink_set_status(hdr, NL_MMAP_STATUS_VALID);
- }
- ring = &nlk_sk(sk)->rx_ring;
- }
-
- WARN_ON(atomic_read(&ring->pending) == 0);
- atomic_dec(&ring->pending);
- sock_put(sk);
-
- skb->head = NULL;
- }
-#endif
if (is_vmalloc_addr(skb->head)) {
if (!skb->cloned ||
!atomic_dec_return(&(skb_shinfo(skb)->dataref)))
@@ -937,18 +335,6 @@ static void netlink_sock_destruct(struct sock *sk)
}
skb_queue_purge(&sk->sk_receive_queue);
-#ifdef CONFIG_NETLINK_MMAP
- if (1) {
- struct nl_mmap_req req;
-
- memset(&req, 0, sizeof(req));
- if (nlk->rx_ring.pg_vec)
- __netlink_set_ring(sk, &req, false, NULL, 0);
- memset(&req, 0, sizeof(req));
- if (nlk->tx_ring.pg_vec)
- __netlink_set_ring(sk, &req, true, NULL, 0);
- }
-#endif /* CONFIG_NETLINK_MMAP */
if (!sock_flag(sk, SOCK_DEAD)) {
printk(KERN_ERR "Freeing alive netlink socket %p\n", sk);
@@ -1194,9 +580,6 @@ static int __netlink_create(struct net *net, struct socket *sock,
mutex_init(nlk->cb_mutex);
}
init_waitqueue_head(&nlk->wait);
-#ifdef CONFIG_NETLINK_MMAP
- mutex_init(&nlk->pg_vec_lock);
-#endif
sk->sk_destruct = netlink_sock_destruct;
sk->sk_protocol = protocol;
@@ -1650,6 +1033,14 @@ static int netlink_getname(struct socket *sock, struct sockaddr *addr,
return 0;
}
+static int netlink_ioctl(struct socket *sock, unsigned int cmd,
+ unsigned long arg)
+{
+ /* try to hand this ioctl down to the NIC drivers.
+ */
+ return -ENOIOCTLCMD;
+}
+
static struct sock *netlink_getsockbyportid(struct sock *ssk, u32 portid)
{
struct sock *sock;
@@ -1728,8 +1119,7 @@ int netlink_attachskb(struct sock *sk, struct sk_buff *skb,
nlk = nlk_sk(sk);
if ((atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf ||
- test_bit(NETLINK_S_CONGESTED, &nlk->state)) &&
- !netlink_skb_is_mmaped(skb)) {
+ test_bit(NETLINK_S_CONGESTED, &nlk->state))) {
DECLARE_WAITQUEUE(wait, current);
if (!*timeo) {
if (!ssk || netlink_is_kernel(ssk))
@@ -1767,14 +1157,7 @@ static int __netlink_sendskb(struct sock *sk, struct sk_buff *skb)
netlink_deliver_tap(skb);
-#ifdef CONFIG_NETLINK_MMAP
- if (netlink_skb_is_mmaped(skb))
- netlink_queue_mmaped_skb(sk, skb);
- else if (netlink_rx_is_mmaped(sk))
- netlink_ring_set_copied(sk, skb);
- else
-#endif /* CONFIG_NETLINK_MMAP */
- skb_queue_tail(&sk->sk_receive_queue, skb);
+ skb_queue_tail(&sk->sk_receive_queue, skb);
sk->sk_data_ready(sk);
return len;
}
@@ -1798,9 +1181,6 @@ static struct sk_buff *netlink_trim(struct sk_buff *skb, gfp_t allocation)
int delta;
WARN_ON(skb->sk != NULL);
- if (netlink_skb_is_mmaped(skb))
- return skb;
-
delta = skb->end - skb->tail;
if (is_vmalloc_addr(skb->head) || delta * 2 < skb->truesize)
return skb;
@@ -1876,79 +1256,6 @@ retry:
}
EXPORT_SYMBOL(netlink_unicast);
-struct sk_buff *__netlink_alloc_skb(struct sock *ssk, unsigned int size,
- unsigned int ldiff, u32 dst_portid,
- gfp_t gfp_mask)
-{
-#ifdef CONFIG_NETLINK_MMAP
- unsigned int maxlen, linear_size;
- struct sock *sk = NULL;
- struct sk_buff *skb;
- struct netlink_ring *ring;
- struct nl_mmap_hdr *hdr;
-
- sk = netlink_getsockbyportid(ssk, dst_portid);
- if (IS_ERR(sk))
- goto out;
-
- ring = &nlk_sk(sk)->rx_ring;
- /* fast-path without atomic ops for common case: non-mmaped receiver */
- if (ring->pg_vec == NULL)
- goto out_put;
-
- /* We need to account the full linear size needed as a ring
- * slot cannot have non-linear parts.
- */
- linear_size = size + ldiff;
- if (ring->frame_size - NL_MMAP_HDRLEN < linear_size)
- goto out_put;
-
- skb = alloc_skb_head(gfp_mask);
- if (skb == NULL)
- goto err1;
-
- spin_lock_bh(&sk->sk_receive_queue.lock);
- /* check again under lock */
- if (ring->pg_vec == NULL)
- goto out_free;
-
- /* check again under lock */
- maxlen = ring->frame_size - NL_MMAP_HDRLEN;
- if (maxlen < linear_size)
- goto out_free;
-
- netlink_forward_ring(ring);
- hdr = netlink_current_frame(ring, NL_MMAP_STATUS_UNUSED);
- if (hdr == NULL)
- goto err2;
-
- netlink_ring_setup_skb(skb, sk, ring, hdr);
- netlink_set_status(hdr, NL_MMAP_STATUS_RESERVED);
- atomic_inc(&ring->pending);
- netlink_increment_head(ring);
-
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- return skb;
-
-err2:
- kfree_skb(skb);
- spin_unlock_bh(&sk->sk_receive_queue.lock);
- netlink_overrun(sk);
-err1:
- sock_put(sk);
- return NULL;
-
-out_free:
- kfree_skb(skb);
- spin_unlock_bh(&sk->sk_receive_queue.lock);
-out_put:
- sock_put(sk);
-out:
-#endif
- return alloc_skb(size, gfp_mask);
-}
-EXPORT_SYMBOL_GPL(__netlink_alloc_skb);
-
int netlink_has_listeners(struct sock *sk, unsigned int group)
{
int res = 0;
@@ -2116,7 +1423,7 @@ int netlink_broadcast_filtered(struct sock *ssk, struct sk_buff *skb, u32 portid
consume_skb(info.skb2);
if (info.delivered) {
- if (info.congested && (allocation & __GFP_WAIT))
+ if (info.congested && gfpflags_allow_blocking(allocation))
yield();
return 0;
}
@@ -2225,8 +1532,7 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
if (level != SOL_NETLINK)
return -ENOPROTOOPT;
- if (optname != NETLINK_RX_RING && optname != NETLINK_TX_RING &&
- optlen >= sizeof(int) &&
+ if (optlen >= sizeof(int) &&
get_user(val, (unsigned int __user *)optval))
return -EFAULT;
@@ -2279,25 +1585,6 @@ static int netlink_setsockopt(struct socket *sock, int level, int optname,
}
err = 0;
break;
-#ifdef CONFIG_NETLINK_MMAP
- case NETLINK_RX_RING:
- case NETLINK_TX_RING: {
- struct nl_mmap_req req;
-
- /* Rings might consume more memory than queue limits, require
- * CAP_NET_ADMIN.
- */
- if (!capable(CAP_NET_ADMIN))
- return -EPERM;
- if (optlen < sizeof(req))
- return -EINVAL;
- if (copy_from_user(&req, optval, sizeof(req)))
- return -EFAULT;
- err = netlink_set_ring(sk, &req,
- optname == NETLINK_TX_RING);
- break;
- }
-#endif /* CONFIG_NETLINK_MMAP */
case NETLINK_LISTEN_ALL_NSID:
if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_BROADCAST))
return -EPERM;
@@ -2467,18 +1754,6 @@ static int netlink_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
smp_rmb();
}
- /* It's a really convoluted way for userland to ask for mmaped
- * sendmsg(), but that's what we've got...
- */
- if (netlink_tx_is_mmaped(sk) &&
- iter_is_iovec(&msg->msg_iter) &&
- msg->msg_iter.nr_segs == 1 &&
- msg->msg_iter.iov->iov_base == NULL) {
- err = netlink_mmap_sendmsg(sk, msg, dst_portid, dst_group,
- &scm);
- goto out;
- }
-
err = -EMSGSIZE;
if (len > sk->sk_sndbuf - 32)
goto out;
@@ -2794,8 +2069,7 @@ static int netlink_dump(struct sock *sk)
goto errout_skb;
}
- if (!netlink_rx_is_mmaped(sk) &&
- atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
+ if (atomic_read(&sk->sk_rmem_alloc) >= sk->sk_rcvbuf)
goto errout_skb;
/* NLMSG_GOODSIZE is small to avoid high order allocations being
@@ -2808,15 +2082,12 @@ static int netlink_dump(struct sock *sk)
if (alloc_min_size < nlk->max_recvmsg_len) {
alloc_size = nlk->max_recvmsg_len;
- skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
- GFP_KERNEL |
- __GFP_NOWARN |
- __GFP_NORETRY);
+ skb = alloc_skb(alloc_size, GFP_KERNEL |
+ __GFP_NOWARN | __GFP_NORETRY);
}
if (!skb) {
alloc_size = alloc_min_size;
- skb = netlink_alloc_skb(sk, alloc_size, nlk->portid,
- GFP_KERNEL);
+ skb = alloc_skb(alloc_size, GFP_KERNEL);
}
if (!skb)
goto errout_skb;
@@ -2883,16 +2154,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
struct netlink_sock *nlk;
int ret;
- /* Memory mapped dump requests need to be copied to avoid looping
- * on the pending state in netlink_mmap_sendmsg() while the CB hold
- * a reference to the skb.
- */
- if (netlink_skb_is_mmaped(skb)) {
- skb = skb_copy(skb, GFP_KERNEL);
- if (skb == NULL)
- return -ENOBUFS;
- } else
- atomic_inc(&skb->users);
+ atomic_inc(&skb->users);
sk = netlink_lookup(sock_net(ssk), ssk->sk_protocol, NETLINK_CB(skb).portid);
if (sk == NULL) {
@@ -2915,6 +2177,7 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
cb = &nlk->cb;
memset(cb, 0, sizeof(*cb));
+ cb->start = control->start;
cb->dump = control->dump;
cb->done = control->done;
cb->nlh = nlh;
@@ -2927,6 +2190,9 @@ int __netlink_dump_start(struct sock *ssk, struct sk_buff *skb,
mutex_unlock(nlk->cb_mutex);
+ if (cb->start)
+ cb->start(cb);
+
ret = netlink_dump(sk);
sock_put(sk);
@@ -2961,8 +2227,7 @@ void netlink_ack(struct sk_buff *in_skb, struct nlmsghdr *nlh, int err)
if (!(nlk->flags & NETLINK_F_CAP_ACK) && err)
payload += nlmsg_len(nlh);
- skb = netlink_alloc_skb(in_skb->sk, nlmsg_total_size(payload),
- NETLINK_CB(in_skb).portid, GFP_KERNEL);
+ skb = nlmsg_new(payload, GFP_KERNEL);
if (!skb) {
struct sock *sk;
@@ -3236,15 +2501,15 @@ static const struct proto_ops netlink_ops = {
.socketpair = sock_no_socketpair,
.accept = sock_no_accept,
.getname = netlink_getname,
- .poll = netlink_poll,
- .ioctl = sock_no_ioctl,
+ .poll = datagram_poll,
+ .ioctl = netlink_ioctl,
.listen = sock_no_listen,
.shutdown = sock_no_shutdown,
.setsockopt = netlink_setsockopt,
.getsockopt = netlink_getsockopt,
.sendmsg = netlink_sendmsg,
.recvmsg = netlink_recvmsg,
- .mmap = netlink_mmap,
+ .mmap = sock_no_mmap,
.sendpage = sock_no_sendpage,
};
diff --git a/net/netlink/af_netlink.h b/net/netlink/af_netlink.h
index 14437d9b1965..e68ef9ccd703 100644
--- a/net/netlink/af_netlink.h
+++ b/net/netlink/af_netlink.h
@@ -44,12 +44,6 @@ struct netlink_sock {
int (*netlink_bind)(struct net *net, int group);
void (*netlink_unbind)(struct net *net, int group);
struct module *module;
-#ifdef CONFIG_NETLINK_MMAP
- struct mutex pg_vec_lock;
- struct netlink_ring rx_ring;
- struct netlink_ring tx_ring;
- atomic_t mapped;
-#endif /* CONFIG_NETLINK_MMAP */
struct rhash_head node;
struct rcu_head rcu;
@@ -60,15 +54,6 @@ static inline struct netlink_sock *nlk_sk(struct sock *sk)
return container_of(sk, struct netlink_sock, sk);
}
-static inline bool netlink_skb_is_mmaped(const struct sk_buff *skb)
-{
-#ifdef CONFIG_NETLINK_MMAP
- return NETLINK_CB(skb).flags & NETLINK_SKB_MMAPED;
-#else
- return false;
-#endif /* CONFIG_NETLINK_MMAP */
-}
-
struct netlink_table {
struct rhashtable hash;
struct hlist_head mc_list;
diff --git a/net/netlink/diag.c b/net/netlink/diag.c
index 3ee63a3cff30..8dd836a8dd60 100644
--- a/net/netlink/diag.c
+++ b/net/netlink/diag.c
@@ -8,41 +8,6 @@
#include "af_netlink.h"
-#ifdef CONFIG_NETLINK_MMAP
-static int sk_diag_put_ring(struct netlink_ring *ring, int nl_type,
- struct sk_buff *nlskb)
-{
- struct netlink_diag_ring ndr;
-
- ndr.ndr_block_size = ring->pg_vec_pages << PAGE_SHIFT;
- ndr.ndr_block_nr = ring->pg_vec_len;
- ndr.ndr_frame_size = ring->frame_size;
- ndr.ndr_frame_nr = ring->frame_max + 1;
-
- return nla_put(nlskb, nl_type, sizeof(ndr), &ndr);
-}
-
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
- struct netlink_sock *nlk = nlk_sk(sk);
- int ret;
-
- mutex_lock(&nlk->pg_vec_lock);
- ret = sk_diag_put_ring(&nlk->rx_ring, NETLINK_DIAG_RX_RING, nlskb);
- if (!ret)
- ret = sk_diag_put_ring(&nlk->tx_ring, NETLINK_DIAG_TX_RING,
- nlskb);
- mutex_unlock(&nlk->pg_vec_lock);
-
- return ret;
-}
-#else
-static int sk_diag_put_rings_cfg(struct sock *sk, struct sk_buff *nlskb)
-{
- return 0;
-}
-#endif
-
static int sk_diag_dump_groups(struct sock *sk, struct sk_buff *nlskb)
{
struct netlink_sock *nlk = nlk_sk(sk);
@@ -87,10 +52,6 @@ static int sk_diag_fill(struct sock *sk, struct sk_buff *skb,
sock_diag_put_meminfo(sk, skb, NETLINK_DIAG_MEMINFO))
goto out_nlmsg_trim;
- if ((req->ndiag_show & NDIAG_SHOW_RING_CFG) &&
- sk_diag_put_rings_cfg(sk, skb))
- goto out_nlmsg_trim;
-
nlmsg_end(skb, nlh);
return 0;
diff --git a/net/netlink/genetlink.c b/net/netlink/genetlink.c
index bc0e504f33a6..a09132a69869 100644
--- a/net/netlink/genetlink.c
+++ b/net/netlink/genetlink.c
@@ -185,7 +185,7 @@ static int genl_allocate_reserve_groups(int n_groups, int *first_id)
}
}
- if (id >= mc_groups_longs * BITS_PER_LONG) {
+ if (id + n_groups > mc_groups_longs * BITS_PER_LONG) {
unsigned long new_longs = mc_groups_longs +
BITS_TO_LONGS(n_groups);
size_t nlen = new_longs * sizeof(unsigned long);
@@ -463,26 +463,6 @@ int genl_unregister_family(struct genl_family *family)
EXPORT_SYMBOL(genl_unregister_family);
/**
- * genlmsg_new_unicast - Allocate generic netlink message for unicast
- * @payload: size of the message payload
- * @info: information on destination
- * @flags: the type of memory to allocate
- *
- * Allocates a new sk_buff large enough to cover the specified payload
- * plus required Netlink headers. Will check receiving socket for
- * memory mapped i/o capability and use it if enabled. Will fall back
- * to non-mapped skb if message size exceeds the frame size of the ring.
- */
-struct sk_buff *genlmsg_new_unicast(size_t payload, struct genl_info *info,
- gfp_t flags)
-{
- size_t len = nlmsg_total_size(genlmsg_total_size(payload));
-
- return netlink_alloc_skb(info->dst_sk, len, info->snd_portid, flags);
-}
-EXPORT_SYMBOL_GPL(genlmsg_new_unicast);
-
-/**
* genlmsg_put - Add generic netlink header to netlink message
* @skb: socket buffer holding the message
* @portid: netlink portid the message is addressed to
@@ -513,6 +493,20 @@ void *genlmsg_put(struct sk_buff *skb, u32 portid, u32 seq,
}
EXPORT_SYMBOL(genlmsg_put);
+static int genl_lock_start(struct netlink_callback *cb)
+{
+ /* our ops are always const - netlink API doesn't propagate that */
+ const struct genl_ops *ops = cb->data;
+ int rc = 0;
+
+ if (ops->start) {
+ genl_lock();
+ rc = ops->start(cb);
+ genl_unlock();
+ }
+ return rc;
+}
+
static int genl_lock_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
{
/* our ops are always const - netlink API doesn't propagate that */
@@ -566,6 +560,10 @@ static int genl_family_rcv_msg(struct genl_family *family,
!netlink_capable(skb, CAP_NET_ADMIN))
return -EPERM;
+ if ((ops->flags & GENL_UNS_ADMIN_PERM) &&
+ !netlink_ns_capable(skb, net->user_ns, CAP_NET_ADMIN))
+ return -EPERM;
+
if ((nlh->nlmsg_flags & NLM_F_DUMP) == NLM_F_DUMP) {
int rc;
@@ -577,6 +575,7 @@ static int genl_family_rcv_msg(struct genl_family *family,
.module = family->module,
/* we have const, but the netlink API doesn't */
.data = (void *)ops,
+ .start = genl_lock_start,
.dump = genl_lock_dumpit,
.done = genl_lock_done,
};
@@ -588,6 +587,7 @@ static int genl_family_rcv_msg(struct genl_family *family,
} else {
struct netlink_dump_control c = {
.module = family->module,
+ .start = ops->start,
.dump = ops->dumpit,
.done = ops->done,
};
@@ -622,7 +622,6 @@ static int genl_family_rcv_msg(struct genl_family *family,
info.genlhdr = nlmsg_data(nlh);
info.userhdr = nlmsg_data(nlh) + GENL_HDRLEN;
info.attrs = attrbuf;
- info.dst_sk = skb->sk;
genl_info_net_set(&info, net);
memset(&info.user_ptr, 0, sizeof(info.user_ptr));
diff --git a/net/nfc/core.c b/net/nfc/core.c
index 1fe3d3b362c0..122bb81da918 100644
--- a/net/nfc/core.c
+++ b/net/nfc/core.c
@@ -953,6 +953,19 @@ out:
}
EXPORT_SYMBOL(nfc_se_transaction);
+int nfc_se_connectivity(struct nfc_dev *dev, u8 se_idx)
+{
+ int rc;
+
+ pr_debug("connectivity: %x\n", se_idx);
+
+ device_lock(&dev->dev);
+ rc = nfc_genl_se_connectivity(dev, se_idx);
+ device_unlock(&dev->dev);
+ return rc;
+}
+EXPORT_SYMBOL(nfc_se_connectivity);
+
static void nfc_release(struct device *d)
{
struct nfc_dev *dev = to_nfc_dev(d);
diff --git a/net/nfc/digital_core.c b/net/nfc/digital_core.c
index 23c2a118ac9f..dd9003f38822 100644
--- a/net/nfc/digital_core.c
+++ b/net/nfc/digital_core.c
@@ -20,7 +20,8 @@
#include "digital.h"
#define DIGITAL_PROTO_NFCA_RF_TECH \
- (NFC_PROTO_JEWEL_MASK | NFC_PROTO_MIFARE_MASK | NFC_PROTO_NFC_DEP_MASK)
+ (NFC_PROTO_JEWEL_MASK | NFC_PROTO_MIFARE_MASK | \
+ NFC_PROTO_NFC_DEP_MASK | NFC_PROTO_ISO14443_MASK)
#define DIGITAL_PROTO_NFCB_RF_TECH NFC_PROTO_ISO14443_B_MASK
diff --git a/net/nfc/llcp_commands.c b/net/nfc/llcp_commands.c
index 3621a902cb6e..3425532c39f7 100644
--- a/net/nfc/llcp_commands.c
+++ b/net/nfc/llcp_commands.c
@@ -663,7 +663,7 @@ int nfc_llcp_send_i_frame(struct nfc_llcp_sock *sock,
return -ENOBUFS;
}
- msg_data = kzalloc(len, GFP_KERNEL);
+ msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN);
if (msg_data == NULL)
return -ENOMEM;
@@ -729,7 +729,7 @@ int nfc_llcp_send_ui_frame(struct nfc_llcp_sock *sock, u8 ssap, u8 dsap,
if (local == NULL)
return -ENODEV;
- msg_data = kzalloc(len, GFP_KERNEL);
+ msg_data = kmalloc(len, GFP_USER | __GFP_NOWARN);
if (msg_data == NULL)
return -ENOMEM;
diff --git a/net/nfc/llcp_sock.c b/net/nfc/llcp_sock.c
index b7de0da46acd..b9edf5fae6ae 100644
--- a/net/nfc/llcp_sock.c
+++ b/net/nfc/llcp_sock.c
@@ -509,6 +509,11 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
memset(llcp_addr, 0, sizeof(*llcp_addr));
*len = sizeof(struct sockaddr_nfc_llcp);
+ lock_sock(sk);
+ if (!llcp_sock->dev) {
+ release_sock(sk);
+ return -EBADFD;
+ }
llcp_addr->sa_family = AF_NFC;
llcp_addr->dev_idx = llcp_sock->dev->idx;
llcp_addr->target_idx = llcp_sock->target_idx;
@@ -518,6 +523,7 @@ static int llcp_sock_getname(struct socket *sock, struct sockaddr *uaddr,
llcp_addr->service_name_len = llcp_sock->service_name_len;
memcpy(llcp_addr->service_name, llcp_sock->service_name,
llcp_addr->service_name_len);
+ release_sock(sk);
return 0;
}
@@ -572,7 +578,7 @@ static unsigned int llcp_sock_poll(struct file *file, struct socket *sock,
if (sock_writeable(sk) && sk->sk_state == LLCP_CONNECTED)
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
pr_debug("mask 0x%x\n", mask);
diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
index 10c99a578421..fbb7a2b57b44 100644
--- a/net/nfc/nci/core.c
+++ b/net/nfc/nci/core.c
@@ -610,14 +610,14 @@ int nci_core_conn_create(struct nci_dev *ndev, u8 destination_type,
struct nci_core_conn_create_cmd *cmd;
struct core_conn_create_data data;
+ if (!number_destination_params)
+ return -EINVAL;
+
data.length = params_len + sizeof(struct nci_core_conn_create_cmd);
cmd = kzalloc(data.length, GFP_KERNEL);
if (!cmd)
return -ENOMEM;
- if (!number_destination_params)
- return -EINVAL;
-
cmd->destination_type = destination_type;
cmd->number_destination_params = number_destination_params;
memcpy(cmd->params, params, params_len);
diff --git a/net/nfc/nci/hci.c b/net/nfc/nci/hci.c
index 2aedac15cb59..a0ab26d535dc 100644
--- a/net/nfc/nci/hci.c
+++ b/net/nfc/nci/hci.c
@@ -676,7 +676,7 @@ int nci_hci_connect_gate(struct nci_dev *ndev,
break;
default:
pipe = nci_hci_create_pipe(ndev, dest_host, dest_gate, &r);
- if (pipe < 0)
+ if (pipe == NCI_HCI_INVALID_PIPE)
return r;
pipe_created = true;
break;
diff --git a/net/nfc/nci/uart.c b/net/nfc/nci/uart.c
index 21d8875673a4..c468eabd6943 100644
--- a/net/nfc/nci/uart.c
+++ b/net/nfc/nci/uart.c
@@ -171,14 +171,7 @@ static int nci_uart_tty_open(struct tty_struct *tty)
tty->disc_data = NULL;
tty->receive_room = 65536;
- /* Flush any pending characters in the driver and line discipline. */
-
- /* FIXME: why is this needed. Note don't use ldisc_ref here as the
- * open path is before the ldisc is referencable.
- */
-
- if (tty->ldisc->ops->flush_buffer)
- tty->ldisc->ops->flush_buffer(tty);
+ /* Flush any pending characters in the driver */
tty_driver_flush_buffer(tty);
return 0;
diff --git a/net/nfc/netlink.c b/net/nfc/netlink.c
index f58c1fba1026..ea023b35f1c2 100644
--- a/net/nfc/netlink.c
+++ b/net/nfc/netlink.c
@@ -552,6 +552,43 @@ free_msg:
return -EMSGSIZE;
}
+int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx)
+{
+ struct nfc_se *se;
+ struct sk_buff *msg;
+ void *hdr;
+
+ msg = nlmsg_new(NLMSG_DEFAULT_SIZE, GFP_KERNEL);
+ if (!msg)
+ return -ENOMEM;
+
+ hdr = genlmsg_put(msg, 0, 0, &nfc_genl_family, 0,
+ NFC_EVENT_SE_CONNECTIVITY);
+ if (!hdr)
+ goto free_msg;
+
+ se = nfc_find_se(dev, se_idx);
+ if (!se)
+ goto free_msg;
+
+ if (nla_put_u32(msg, NFC_ATTR_DEVICE_INDEX, dev->idx) ||
+ nla_put_u32(msg, NFC_ATTR_SE_INDEX, se_idx) ||
+ nla_put_u8(msg, NFC_ATTR_SE_TYPE, se->type))
+ goto nla_put_failure;
+
+ genlmsg_end(msg, hdr);
+
+ genlmsg_multicast(&nfc_genl_family, msg, 0, 0, GFP_KERNEL);
+
+ return 0;
+
+nla_put_failure:
+ genlmsg_cancel(msg, hdr);
+free_msg:
+ nlmsg_free(msg);
+ return -EMSGSIZE;
+}
+
static int nfc_genl_send_device(struct sk_buff *msg, struct nfc_dev *dev,
u32 portid, u32 seq,
struct netlink_callback *cb,
diff --git a/net/nfc/nfc.h b/net/nfc/nfc.h
index c20b784ad720..6c6f76b370b1 100644
--- a/net/nfc/nfc.h
+++ b/net/nfc/nfc.h
@@ -105,6 +105,7 @@ int nfc_genl_se_added(struct nfc_dev *dev, u32 se_idx, u16 type);
int nfc_genl_se_removed(struct nfc_dev *dev, u32 se_idx);
int nfc_genl_se_transaction(struct nfc_dev *dev, u8 se_idx,
struct nfc_evt_transaction *evt_transaction);
+int nfc_genl_se_connectivity(struct nfc_dev *dev, u8 se_idx);
struct nfc_dev *nfc_get_device(unsigned int idx);
diff --git a/net/openvswitch/Kconfig b/net/openvswitch/Kconfig
index d143aa9f6654..ce947292ae77 100644
--- a/net/openvswitch/Kconfig
+++ b/net/openvswitch/Kconfig
@@ -6,10 +6,14 @@ config OPENVSWITCH
tristate "Open vSwitch"
depends on INET
depends on !NF_CONNTRACK || \
- (NF_CONNTRACK && (!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6))
+ (NF_CONNTRACK && ((!NF_DEFRAG_IPV6 || NF_DEFRAG_IPV6) && \
+ (!NF_NAT || NF_NAT) && \
+ (!NF_NAT_IPV4 || NF_NAT_IPV4) && \
+ (!NF_NAT_IPV6 || NF_NAT_IPV6)))
select LIBCRC32C
select MPLS
select NET_MPLS_GSO
+ select DST_CACHE
---help---
Open vSwitch is a multilayer Ethernet switch targeted at virtualized
environments. In addition to supporting a variety of features
diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index c88d0f2d3e01..e9dd47b2a85b 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -158,9 +158,7 @@ static int push_mpls(struct sk_buff *skb, struct sw_flow_key *key,
new_mpls_lse = (__be32 *)skb_mpls_header(skb);
*new_mpls_lse = mpls->mpls_lse;
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(skb->csum, csum_partial(new_mpls_lse,
- MPLS_HLEN, 0));
+ skb_postpush_rcsum(skb, new_mpls_lse, MPLS_HLEN);
hdr = eth_hdr(skb);
hdr->h_proto = mpls->mpls_ethertype;
@@ -280,7 +278,7 @@ static int set_eth_addr(struct sk_buff *skb, struct sw_flow_key *flow_key,
ether_addr_copy_masked(eth_hdr(skb)->h_dest, key->eth_dst,
mask->eth_dst);
- ovs_skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
+ skb_postpush_rcsum(skb, eth_hdr(skb), ETH_ALEN * 2);
ether_addr_copy(flow_key->eth.src, eth_hdr(skb)->h_source);
ether_addr_copy(flow_key->eth.dst, eth_hdr(skb)->h_dest);
@@ -639,7 +637,7 @@ static int ovs_vport_output(struct net *net, struct sock *sk, struct sk_buff *sk
/* Reconstruct the MAC header. */
skb_push(skb, data->l2_len);
memcpy(skb->data, &data->l2_data, data->l2_len);
- ovs_skb_postpush_rcsum(skb, skb->data, data->l2_len);
+ skb_postpush_rcsum(skb, skb->data, data->l2_len);
skb_reset_mac_header(skb);
ovs_vport_send(vport, skb);
@@ -1160,17 +1158,26 @@ int ovs_execute_actions(struct datapath *dp, struct sk_buff *skb,
const struct sw_flow_actions *acts,
struct sw_flow_key *key)
{
- int level = this_cpu_read(exec_actions_level);
- int err;
+ static const int ovs_recursion_limit = 5;
+ int err, level;
+
+ level = __this_cpu_inc_return(exec_actions_level);
+ if (unlikely(level > ovs_recursion_limit)) {
+ net_crit_ratelimited("ovs: recursion limit reached on datapath %s, probable configuration error\n",
+ ovs_dp_name(dp));
+ kfree_skb(skb);
+ err = -ENETDOWN;
+ goto out;
+ }
- this_cpu_inc(exec_actions_level);
err = do_execute_actions(dp, skb, key,
acts->actions, acts->actions_len);
- if (!level)
+ if (level == 1)
process_deferred_actions(dp);
- this_cpu_dec(exec_actions_level);
+out:
+ __this_cpu_dec(exec_actions_level);
return err;
}
diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index c2cc11168fd5..1b9d286756be 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -13,21 +13,31 @@
#include <linux/module.h>
#include <linux/openvswitch.h>
+#include <linux/tcp.h>
+#include <linux/udp.h>
+#include <linux/sctp.h>
#include <net/ip.h>
#include <net/netfilter/nf_conntrack_core.h>
#include <net/netfilter/nf_conntrack_helper.h>
#include <net/netfilter/nf_conntrack_labels.h>
+#include <net/netfilter/nf_conntrack_seqadj.h>
#include <net/netfilter/nf_conntrack_zones.h>
#include <net/netfilter/ipv6/nf_defrag_ipv6.h>
+#ifdef CONFIG_NF_NAT_NEEDED
+#include <linux/netfilter/nf_nat.h>
+#include <net/netfilter/nf_nat_core.h>
+#include <net/netfilter/nf_nat_l3proto.h>
+#endif
+
#include "datapath.h"
#include "conntrack.h"
#include "flow.h"
#include "flow_netlink.h"
struct ovs_ct_len_tbl {
- size_t maxlen;
- size_t minlen;
+ int maxlen;
+ int minlen;
};
/* Metadata mark for masked write to conntrack mark */
@@ -42,17 +52,29 @@ struct md_labels {
struct ovs_key_ct_labels mask;
};
+enum ovs_ct_nat {
+ OVS_CT_NAT = 1 << 0, /* NAT for committed connections only. */
+ OVS_CT_SRC_NAT = 1 << 1, /* Source NAT for NEW connections. */
+ OVS_CT_DST_NAT = 1 << 2, /* Destination NAT for NEW connections. */
+};
+
/* Conntrack action context for execution. */
struct ovs_conntrack_info {
struct nf_conntrack_helper *helper;
struct nf_conntrack_zone zone;
struct nf_conn *ct;
u8 commit : 1;
+ u8 nat : 3; /* enum ovs_ct_nat */
u16 family;
struct md_mark mark;
struct md_labels labels;
+#ifdef CONFIG_NF_NAT_NEEDED
+ struct nf_nat_range range; /* Only present for SRC NAT and DST NAT. */
+#endif
};
+static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info);
+
static u16 key_to_nfproto(const struct sw_flow_key *key)
{
switch (ntohs(key->eth.type)) {
@@ -73,7 +95,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
switch (ctinfo) {
case IP_CT_ESTABLISHED_REPLY:
case IP_CT_RELATED_REPLY:
- case IP_CT_NEW_REPLY:
ct_state |= OVS_CS_F_REPLY_DIR;
break;
default:
@@ -90,7 +111,6 @@ static u8 ovs_ct_get_state(enum ip_conntrack_info ctinfo)
ct_state |= OVS_CS_F_RELATED;
break;
case IP_CT_NEW:
- case IP_CT_NEW_REPLY:
ct_state |= OVS_CS_F_NEW;
break;
default:
@@ -137,11 +157,15 @@ static void __ovs_ct_update_key(struct sw_flow_key *key, u8 state,
ovs_ct_get_labels(ct, &key->ct.labels);
}
-/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
- * previously sent the packet to conntrack via the ct action.
+/* Update 'key' based on skb->nfct. If 'post_ct' is true, then OVS has
+ * previously sent the packet to conntrack via the ct action. If
+ * 'keep_nat_flags' is true, the existing NAT flags retained, else they are
+ * initialized from the connection status.
*/
static void ovs_ct_update_key(const struct sk_buff *skb,
- struct sw_flow_key *key, bool post_ct)
+ const struct ovs_conntrack_info *info,
+ struct sw_flow_key *key, bool post_ct,
+ bool keep_nat_flags)
{
const struct nf_conntrack_zone *zone = &nf_ct_zone_dflt;
enum ip_conntrack_info ctinfo;
@@ -151,20 +175,37 @@ static void ovs_ct_update_key(const struct sk_buff *skb,
ct = nf_ct_get(skb, &ctinfo);
if (ct) {
state = ovs_ct_get_state(ctinfo);
+ /* All unconfirmed entries are NEW connections. */
if (!nf_ct_is_confirmed(ct))
state |= OVS_CS_F_NEW;
+ /* OVS persists the related flag for the duration of the
+ * connection.
+ */
if (ct->master)
state |= OVS_CS_F_RELATED;
+ if (keep_nat_flags) {
+ state |= key->ct.state & OVS_CS_F_NAT_MASK;
+ } else {
+ if (ct->status & IPS_SRC_NAT)
+ state |= OVS_CS_F_SRC_NAT;
+ if (ct->status & IPS_DST_NAT)
+ state |= OVS_CS_F_DST_NAT;
+ }
zone = nf_ct_zone(ct);
} else if (post_ct) {
state = OVS_CS_F_TRACKED | OVS_CS_F_INVALID;
+ if (info)
+ zone = &info->zone;
}
__ovs_ct_update_key(key, state, zone, ct);
}
+/* This is called to initialize CT key fields possibly coming in from the local
+ * stack.
+ */
void ovs_ct_fill_key(const struct sk_buff *skb, struct sw_flow_key *key)
{
- ovs_ct_update_key(skb, key, false);
+ ovs_ct_update_key(skb, NULL, key, false, false);
}
int ovs_ct_put_key(const struct sw_flow_key *key, struct sk_buff *skb)
@@ -196,7 +237,6 @@ static int ovs_ct_set_mark(struct sk_buff *skb, struct sw_flow_key *key,
struct nf_conn *ct;
u32 new_mark;
-
/* The connection could be invalid, in which case set_mark is no-op. */
ct = nf_ct_get(skb, &ctinfo);
if (!ct)
@@ -254,6 +294,7 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
enum ip_conntrack_info ctinfo;
unsigned int protoff;
struct nf_conn *ct;
+ int err;
ct = nf_ct_get(skb, &ctinfo);
if (!ct || ctinfo == IP_CT_RELATED_REPLY)
@@ -290,7 +331,18 @@ static int ovs_ct_helper(struct sk_buff *skb, u16 proto)
return NF_DROP;
}
- return helper->help(skb, protoff, ct, ctinfo);
+ err = helper->help(skb, protoff, ct, ctinfo);
+ if (err != NF_ACCEPT)
+ return err;
+
+ /* Adjust seqs after helper. This is needed due to some helpers (e.g.,
+ * FTP with NAT) adusting the TCP payload size when mangling IP
+ * addresses and/or port numbers in the text-based control connection.
+ */
+ if (test_bit(IPS_SEQ_ADJUST_BIT, &ct->status) &&
+ !nf_ct_seq_adjust(skb, ct, ctinfo, protoff))
+ return NF_DROP;
+ return NF_ACCEPT;
}
/* Returns 0 on success, -EINPROGRESS if 'skb' is stolen, or other nonzero
@@ -300,10 +352,10 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
u16 zone, struct sk_buff *skb)
{
struct ovs_skb_cb ovs_cb = *OVS_CB(skb);
+ int err;
if (key->eth.type == htons(ETH_P_IP)) {
enum ip_defrag_users user = IP_DEFRAG_CONNTRACK_IN + zone;
- int err;
memset(IPCB(skb), 0, sizeof(struct inet_skb_parm));
err = ip_defrag(net, skb, user);
@@ -314,28 +366,13 @@ static int handle_fragments(struct net *net, struct sw_flow_key *key,
#if IS_ENABLED(CONFIG_NF_DEFRAG_IPV6)
} else if (key->eth.type == htons(ETH_P_IPV6)) {
enum ip6_defrag_users user = IP6_DEFRAG_CONNTRACK_IN + zone;
- struct sk_buff *reasm;
memset(IP6CB(skb), 0, sizeof(struct inet6_skb_parm));
- reasm = nf_ct_frag6_gather(net, skb, user);
- if (!reasm)
- return -EINPROGRESS;
-
- if (skb == reasm) {
- kfree_skb(skb);
- return -EINVAL;
- }
-
- /* Don't free 'skb' even though it is one of the original
- * fragments, as we're going to morph it into the head.
- */
- skb_get(skb);
- nf_ct_frag6_consume_orig(reasm);
+ err = nf_ct_frag6_gather(net, skb, user);
+ if (err)
+ return err;
- key->ip.proto = ipv6_hdr(reasm)->nexthdr;
- skb_morph(skb, reasm);
- skb->next = reasm->next;
- consume_skb(reasm);
+ key->ip.proto = ipv6_hdr(skb)->nexthdr;
ovs_cb.mru = IP6CB(skb)->frag_max_size;
#endif
} else {
@@ -362,14 +399,101 @@ ovs_ct_expect_find(struct net *net, const struct nf_conntrack_zone *zone,
return __nf_ct_expect_find(net, zone, &tuple);
}
+/* This replicates logic from nf_conntrack_core.c that is not exported. */
+static enum ip_conntrack_info
+ovs_ct_get_info(const struct nf_conntrack_tuple_hash *h)
+{
+ const struct nf_conn *ct = nf_ct_tuplehash_to_ctrack(h);
+
+ if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
+ return IP_CT_ESTABLISHED_REPLY;
+ /* Once we've had two way comms, always ESTABLISHED. */
+ if (test_bit(IPS_SEEN_REPLY_BIT, &ct->status))
+ return IP_CT_ESTABLISHED;
+ if (test_bit(IPS_EXPECTED_BIT, &ct->status))
+ return IP_CT_RELATED;
+ return IP_CT_NEW;
+}
+
+/* Find an existing connection which this packet belongs to without
+ * re-attributing statistics or modifying the connection state. This allows an
+ * skb->nfct lost due to an upcall to be recovered during actions execution.
+ *
+ * Must be called with rcu_read_lock.
+ *
+ * On success, populates skb->nfct and skb->nfctinfo, and returns the
+ * connection. Returns NULL if there is no existing entry.
+ */
+static struct nf_conn *
+ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
+ u8 l3num, struct sk_buff *skb)
+{
+ struct nf_conntrack_l3proto *l3proto;
+ struct nf_conntrack_l4proto *l4proto;
+ struct nf_conntrack_tuple tuple;
+ struct nf_conntrack_tuple_hash *h;
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+ unsigned int dataoff;
+ u8 protonum;
+
+ l3proto = __nf_ct_l3proto_find(l3num);
+ if (!l3proto) {
+ pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
+ return NULL;
+ }
+ if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
+ &protonum) <= 0) {
+ pr_debug("ovs_ct_find_existing: Can't get protonum\n");
+ return NULL;
+ }
+ l4proto = __nf_ct_l4proto_find(l3num, protonum);
+ if (!l4proto) {
+ pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
+ return NULL;
+ }
+ if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
+ protonum, net, &tuple, l3proto, l4proto)) {
+ pr_debug("ovs_ct_find_existing: Can't get tuple\n");
+ return NULL;
+ }
+
+ /* look for tuple match */
+ h = nf_conntrack_find_get(net, zone, &tuple);
+ if (!h)
+ return NULL; /* Not found. */
+
+ ct = nf_ct_tuplehash_to_ctrack(h);
+
+ ctinfo = ovs_ct_get_info(h);
+ if (ctinfo == IP_CT_NEW) {
+ /* This should not happen. */
+ WARN_ONCE(1, "ovs_ct_find_existing: new packet for %p\n", ct);
+ }
+ skb->nfct = &ct->ct_general;
+ skb->nfctinfo = ctinfo;
+ return ct;
+}
+
/* Determine whether skb->nfct is equal to the result of conntrack lookup. */
-static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
- const struct ovs_conntrack_info *info)
+static bool skb_nfct_cached(struct net *net,
+ const struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb)
{
enum ip_conntrack_info ctinfo;
struct nf_conn *ct;
ct = nf_ct_get(skb, &ctinfo);
+ /* If no ct, check if we have evidence that an existing conntrack entry
+ * might be found for this skb. This happens when we lose a skb->nfct
+ * due to an upcall. If the connection was not confirmed, it is not
+ * cached and needs to be run through conntrack again.
+ */
+ if (!ct && key->ct.state & OVS_CS_F_TRACKED &&
+ !(key->ct.state & OVS_CS_F_INVALID) &&
+ key->ct.zone == info->zone.id)
+ ct = ovs_ct_find_existing(net, &info->zone, info->family, skb);
if (!ct)
return false;
if (!net_eq(net, read_pnet(&ct->ct_net)))
@@ -387,6 +511,207 @@ static bool skb_nfct_cached(const struct net *net, const struct sk_buff *skb,
return true;
}
+#ifdef CONFIG_NF_NAT_NEEDED
+/* Modelled after nf_nat_ipv[46]_fn().
+ * range is only used for new, uninitialized NAT state.
+ * Returns either NF_ACCEPT or NF_DROP.
+ */
+static int ovs_ct_nat_execute(struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo,
+ const struct nf_nat_range *range,
+ enum nf_nat_manip_type maniptype)
+{
+ int hooknum, nh_off, err = NF_ACCEPT;
+
+ nh_off = skb_network_offset(skb);
+ skb_pull(skb, nh_off);
+
+ /* See HOOK2MANIP(). */
+ if (maniptype == NF_NAT_MANIP_SRC)
+ hooknum = NF_INET_LOCAL_IN; /* Source NAT */
+ else
+ hooknum = NF_INET_LOCAL_OUT; /* Destination NAT */
+
+ switch (ctinfo) {
+ case IP_CT_RELATED:
+ case IP_CT_RELATED_REPLY:
+ if (IS_ENABLED(CONFIG_NF_NAT_IPV4) &&
+ skb->protocol == htons(ETH_P_IP) &&
+ ip_hdr(skb)->protocol == IPPROTO_ICMP) {
+ if (!nf_nat_icmp_reply_translation(skb, ct, ctinfo,
+ hooknum))
+ err = NF_DROP;
+ goto push;
+ } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) &&
+ skb->protocol == htons(ETH_P_IPV6)) {
+ __be16 frag_off;
+ u8 nexthdr = ipv6_hdr(skb)->nexthdr;
+ int hdrlen = ipv6_skip_exthdr(skb,
+ sizeof(struct ipv6hdr),
+ &nexthdr, &frag_off);
+
+ if (hdrlen >= 0 && nexthdr == IPPROTO_ICMPV6) {
+ if (!nf_nat_icmpv6_reply_translation(skb, ct,
+ ctinfo,
+ hooknum,
+ hdrlen))
+ err = NF_DROP;
+ goto push;
+ }
+ }
+ /* Non-ICMP, fall thru to initialize if needed. */
+ case IP_CT_NEW:
+ /* Seen it before? This can happen for loopback, retrans,
+ * or local packets.
+ */
+ if (!nf_nat_initialized(ct, maniptype)) {
+ /* Initialize according to the NAT action. */
+ err = (range && range->flags & NF_NAT_RANGE_MAP_IPS)
+ /* Action is set up to establish a new
+ * mapping.
+ */
+ ? nf_nat_setup_info(ct, range, maniptype)
+ : nf_nat_alloc_null_binding(ct, hooknum);
+ if (err != NF_ACCEPT)
+ goto push;
+ }
+ break;
+
+ case IP_CT_ESTABLISHED:
+ case IP_CT_ESTABLISHED_REPLY:
+ break;
+
+ default:
+ err = NF_DROP;
+ goto push;
+ }
+
+ err = nf_nat_packet(ct, ctinfo, hooknum, skb);
+push:
+ skb_push(skb, nh_off);
+
+ return err;
+}
+
+static void ovs_nat_update_key(struct sw_flow_key *key,
+ const struct sk_buff *skb,
+ enum nf_nat_manip_type maniptype)
+{
+ if (maniptype == NF_NAT_MANIP_SRC) {
+ __be16 src;
+
+ key->ct.state |= OVS_CS_F_SRC_NAT;
+ if (key->eth.type == htons(ETH_P_IP))
+ key->ipv4.addr.src = ip_hdr(skb)->saddr;
+ else if (key->eth.type == htons(ETH_P_IPV6))
+ memcpy(&key->ipv6.addr.src, &ipv6_hdr(skb)->saddr,
+ sizeof(key->ipv6.addr.src));
+ else
+ return;
+
+ if (key->ip.proto == IPPROTO_UDP)
+ src = udp_hdr(skb)->source;
+ else if (key->ip.proto == IPPROTO_TCP)
+ src = tcp_hdr(skb)->source;
+ else if (key->ip.proto == IPPROTO_SCTP)
+ src = sctp_hdr(skb)->source;
+ else
+ return;
+
+ key->tp.src = src;
+ } else {
+ __be16 dst;
+
+ key->ct.state |= OVS_CS_F_DST_NAT;
+ if (key->eth.type == htons(ETH_P_IP))
+ key->ipv4.addr.dst = ip_hdr(skb)->daddr;
+ else if (key->eth.type == htons(ETH_P_IPV6))
+ memcpy(&key->ipv6.addr.dst, &ipv6_hdr(skb)->daddr,
+ sizeof(key->ipv6.addr.dst));
+ else
+ return;
+
+ if (key->ip.proto == IPPROTO_UDP)
+ dst = udp_hdr(skb)->dest;
+ else if (key->ip.proto == IPPROTO_TCP)
+ dst = tcp_hdr(skb)->dest;
+ else if (key->ip.proto == IPPROTO_SCTP)
+ dst = sctp_hdr(skb)->dest;
+ else
+ return;
+
+ key->tp.dst = dst;
+ }
+}
+
+/* Returns NF_DROP if the packet should be dropped, NF_ACCEPT otherwise. */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ enum nf_nat_manip_type maniptype;
+ int err;
+
+ if (nf_ct_is_untracked(ct)) {
+ /* A NAT action may only be performed on tracked packets. */
+ return NF_ACCEPT;
+ }
+
+ /* Add NAT extension if not confirmed yet. */
+ if (!nf_ct_is_confirmed(ct) && !nf_ct_nat_ext_add(ct))
+ return NF_ACCEPT; /* Can't NAT. */
+
+ /* Determine NAT type.
+ * Check if the NAT type can be deduced from the tracked connection.
+ * Make sure new expected connections (IP_CT_RELATED) are NATted only
+ * when committing.
+ */
+ if (info->nat & OVS_CT_NAT && ctinfo != IP_CT_NEW &&
+ ct->status & IPS_NAT_MASK &&
+ (ctinfo != IP_CT_RELATED || info->commit)) {
+ /* NAT an established or related connection like before. */
+ if (CTINFO2DIR(ctinfo) == IP_CT_DIR_REPLY)
+ /* This is the REPLY direction for a connection
+ * for which NAT was applied in the forward
+ * direction. Do the reverse NAT.
+ */
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_DST : NF_NAT_MANIP_SRC;
+ else
+ maniptype = ct->status & IPS_SRC_NAT
+ ? NF_NAT_MANIP_SRC : NF_NAT_MANIP_DST;
+ } else if (info->nat & OVS_CT_SRC_NAT) {
+ maniptype = NF_NAT_MANIP_SRC;
+ } else if (info->nat & OVS_CT_DST_NAT) {
+ maniptype = NF_NAT_MANIP_DST;
+ } else {
+ return NF_ACCEPT; /* Connection is not NATed. */
+ }
+ err = ovs_ct_nat_execute(skb, ct, ctinfo, &info->range, maniptype);
+
+ /* Mark NAT done if successful and update the flow key. */
+ if (err == NF_ACCEPT)
+ ovs_nat_update_key(key, skb, maniptype);
+
+ return err;
+}
+#else /* !CONFIG_NF_NAT_NEEDED */
+static int ovs_ct_nat(struct net *net, struct sw_flow_key *key,
+ const struct ovs_conntrack_info *info,
+ struct sk_buff *skb, struct nf_conn *ct,
+ enum ip_conntrack_info ctinfo)
+{
+ return NF_ACCEPT;
+}
+#endif
+
+/* Pass 'skb' through conntrack in 'net', using zone configured in 'info', if
+ * not done already. Update key with new CT state after passing the packet
+ * through conntrack.
+ * Note that if the packet is deemed invalid by conntrack, skb->nfct will be
+ * set to NULL and 0 will be returned.
+ */
static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
@@ -396,8 +721,13 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
* actually run the packet through conntrack twice unless it's for a
* different zone.
*/
- if (!skb_nfct_cached(net, skb, info)) {
+ bool cached = skb_nfct_cached(net, key, info, skb);
+ enum ip_conntrack_info ctinfo;
+ struct nf_conn *ct;
+
+ if (!cached) {
struct nf_conn *tmpl = info->ct;
+ int err;
/* Associate skb with specified zone. */
if (tmpl) {
@@ -408,18 +738,54 @@ static int __ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
skb->nfctinfo = IP_CT_NEW;
}
- if (nf_conntrack_in(net, info->family, NF_INET_PRE_ROUTING,
- skb) != NF_ACCEPT)
+ /* Repeat if requested, see nf_iterate(). */
+ do {
+ err = nf_conntrack_in(net, info->family,
+ NF_INET_PRE_ROUTING, skb);
+ } while (err == NF_REPEAT);
+
+ if (err != NF_ACCEPT)
return -ENOENT;
- if (ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
- WARN_ONCE(1, "helper rejected packet");
+ /* Clear CT state NAT flags to mark that we have not yet done
+ * NAT after the nf_conntrack_in() call. We can actually clear
+ * the whole state, as it will be re-initialized below.
+ */
+ key->ct.state = 0;
+
+ /* Update the key, but keep the NAT flags. */
+ ovs_ct_update_key(skb, info, key, true, true);
+ }
+
+ ct = nf_ct_get(skb, &ctinfo);
+ if (ct) {
+ /* Packets starting a new connection must be NATted before the
+ * helper, so that the helper knows about the NAT. We enforce
+ * this by delaying both NAT and helper calls for unconfirmed
+ * connections until the committing CT action. For later
+ * packets NAT and Helper may be called in either order.
+ *
+ * NAT will be done only if the CT action has NAT, and only
+ * once per packet (per zone), as guarded by the NAT bits in
+ * the key->ct.state.
+ */
+ if (info->nat && !(key->ct.state & OVS_CS_F_NAT_MASK) &&
+ (nf_ct_is_confirmed(ct) || info->commit) &&
+ ovs_ct_nat(net, key, info, skb, ct, ctinfo) != NF_ACCEPT) {
+ return -EINVAL;
+ }
+
+ /* Call the helper only if:
+ * - nf_conntrack_in() was executed above ("!cached") for a
+ * confirmed connection, or
+ * - When committing an unconfirmed connection.
+ */
+ if ((nf_ct_is_confirmed(ct) ? !cached : info->commit) &&
+ ovs_ct_helper(skb, info->family) != NF_ACCEPT) {
return -EINVAL;
}
}
- ovs_ct_update_key(skb, key, true);
-
return 0;
}
@@ -430,19 +796,24 @@ static int ovs_ct_lookup(struct net *net, struct sw_flow_key *key,
{
struct nf_conntrack_expect *exp;
+ /* If we pass an expected packet through nf_conntrack_in() the
+ * expectation is typically removed, but the packet could still be
+ * lost in upcall processing. To prevent this from happening we
+ * perform an explicit expectation lookup. Expected connections are
+ * always new, and will be passed through conntrack only when they are
+ * committed, as it is OK to remove the expectation at that time.
+ */
exp = ovs_ct_expect_find(net, &info->zone, info->family, skb);
if (exp) {
u8 state;
+ /* NOTE: New connections are NATted and Helped only when
+ * committed, so we are not calling into NAT here.
+ */
state = OVS_CS_F_TRACKED | OVS_CS_F_NEW | OVS_CS_F_RELATED;
__ovs_ct_update_key(key, state, &info->zone, exp->master);
- } else {
- int err;
-
- err = __ovs_ct_lookup(net, key, info, skb);
- if (err)
- return err;
- }
+ } else
+ return __ovs_ct_lookup(net, key, info, skb);
return 0;
}
@@ -452,21 +823,12 @@ static int ovs_ct_commit(struct net *net, struct sw_flow_key *key,
const struct ovs_conntrack_info *info,
struct sk_buff *skb)
{
- u8 state;
int err;
- state = key->ct.state;
- if (key->ct.zone == info->zone.id &&
- ((state & OVS_CS_F_TRACKED) && !(state & OVS_CS_F_NEW))) {
- /* Previous lookup has shown that this connection is already
- * tracked and committed. Skip committing.
- */
- return 0;
- }
-
err = __ovs_ct_lookup(net, key, info, skb);
if (err)
return err;
+ /* This is a no-op if the connection has already been confirmed. */
if (nf_conntrack_confirm(skb) != NF_ACCEPT)
return -EINVAL;
@@ -551,6 +913,136 @@ static int ovs_ct_add_helper(struct ovs_conntrack_info *info, const char *name,
return 0;
}
+#ifdef CONFIG_NF_NAT_NEEDED
+static int parse_nat(const struct nlattr *attr,
+ struct ovs_conntrack_info *info, bool log)
+{
+ struct nlattr *a;
+ int rem;
+ bool have_ip_max = false;
+ bool have_proto_max = false;
+ bool ip_vers = (info->family == NFPROTO_IPV6);
+
+ nla_for_each_nested(a, attr, rem) {
+ static const int ovs_nat_attr_lens[OVS_NAT_ATTR_MAX + 1][2] = {
+ [OVS_NAT_ATTR_SRC] = {0, 0},
+ [OVS_NAT_ATTR_DST] = {0, 0},
+ [OVS_NAT_ATTR_IP_MIN] = {sizeof(struct in_addr),
+ sizeof(struct in6_addr)},
+ [OVS_NAT_ATTR_IP_MAX] = {sizeof(struct in_addr),
+ sizeof(struct in6_addr)},
+ [OVS_NAT_ATTR_PROTO_MIN] = {sizeof(u16), sizeof(u16)},
+ [OVS_NAT_ATTR_PROTO_MAX] = {sizeof(u16), sizeof(u16)},
+ [OVS_NAT_ATTR_PERSISTENT] = {0, 0},
+ [OVS_NAT_ATTR_PROTO_HASH] = {0, 0},
+ [OVS_NAT_ATTR_PROTO_RANDOM] = {0, 0},
+ };
+ int type = nla_type(a);
+
+ if (type > OVS_NAT_ATTR_MAX) {
+ OVS_NLERR(log,
+ "Unknown NAT attribute (type=%d, max=%d).\n",
+ type, OVS_NAT_ATTR_MAX);
+ return -EINVAL;
+ }
+
+ if (nla_len(a) != ovs_nat_attr_lens[type][ip_vers]) {
+ OVS_NLERR(log,
+ "NAT attribute type %d has unexpected length (%d != %d).\n",
+ type, nla_len(a),
+ ovs_nat_attr_lens[type][ip_vers]);
+ return -EINVAL;
+ }
+
+ switch (type) {
+ case OVS_NAT_ATTR_SRC:
+ case OVS_NAT_ATTR_DST:
+ if (info->nat) {
+ OVS_NLERR(log,
+ "Only one type of NAT may be specified.\n"
+ );
+ return -ERANGE;
+ }
+ info->nat |= OVS_CT_NAT;
+ info->nat |= ((type == OVS_NAT_ATTR_SRC)
+ ? OVS_CT_SRC_NAT : OVS_CT_DST_NAT);
+ break;
+
+ case OVS_NAT_ATTR_IP_MIN:
+ nla_memcpy(&info->range.min_addr, a,
+ sizeof(info->range.min_addr));
+ info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+ break;
+
+ case OVS_NAT_ATTR_IP_MAX:
+ have_ip_max = true;
+ nla_memcpy(&info->range.max_addr, a,
+ sizeof(info->range.max_addr));
+ info->range.flags |= NF_NAT_RANGE_MAP_IPS;
+ break;
+
+ case OVS_NAT_ATTR_PROTO_MIN:
+ info->range.min_proto.all = htons(nla_get_u16(a));
+ info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ break;
+
+ case OVS_NAT_ATTR_PROTO_MAX:
+ have_proto_max = true;
+ info->range.max_proto.all = htons(nla_get_u16(a));
+ info->range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+ break;
+
+ case OVS_NAT_ATTR_PERSISTENT:
+ info->range.flags |= NF_NAT_RANGE_PERSISTENT;
+ break;
+
+ case OVS_NAT_ATTR_PROTO_HASH:
+ info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM;
+ break;
+
+ case OVS_NAT_ATTR_PROTO_RANDOM:
+ info->range.flags |= NF_NAT_RANGE_PROTO_RANDOM_FULLY;
+ break;
+
+ default:
+ OVS_NLERR(log, "Unknown nat attribute (%d).\n", type);
+ return -EINVAL;
+ }
+ }
+
+ if (rem > 0) {
+ OVS_NLERR(log, "NAT attribute has %d unknown bytes.\n", rem);
+ return -EINVAL;
+ }
+ if (!info->nat) {
+ /* Do not allow flags if no type is given. */
+ if (info->range.flags) {
+ OVS_NLERR(log,
+ "NAT flags may be given only when NAT range (SRC or DST) is also specified.\n"
+ );
+ return -EINVAL;
+ }
+ info->nat = OVS_CT_NAT; /* NAT existing connections. */
+ } else if (!info->commit) {
+ OVS_NLERR(log,
+ "NAT attributes may be specified only when CT COMMIT flag is also specified.\n"
+ );
+ return -EINVAL;
+ }
+ /* Allow missing IP_MAX. */
+ if (info->range.flags & NF_NAT_RANGE_MAP_IPS && !have_ip_max) {
+ memcpy(&info->range.max_addr, &info->range.min_addr,
+ sizeof(info->range.max_addr));
+ }
+ /* Allow missing PROTO_MAX. */
+ if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+ !have_proto_max) {
+ info->range.max_proto.all = info->range.min_proto.all;
+ }
+ return 0;
+}
+#endif
+
static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_COMMIT] = { .minlen = 0, .maxlen = 0 },
[OVS_CT_ATTR_ZONE] = { .minlen = sizeof(u16),
@@ -560,7 +1052,11 @@ static const struct ovs_ct_len_tbl ovs_ct_attr_lens[OVS_CT_ATTR_MAX + 1] = {
[OVS_CT_ATTR_LABELS] = { .minlen = sizeof(struct md_labels),
.maxlen = sizeof(struct md_labels) },
[OVS_CT_ATTR_HELPER] = { .minlen = 1,
- .maxlen = NF_CT_HELPER_NAME_LEN }
+ .maxlen = NF_CT_HELPER_NAME_LEN },
+#ifdef CONFIG_NF_NAT_NEEDED
+ /* NAT length is checked when parsing the nested attributes. */
+ [OVS_CT_ATTR_NAT] = { .minlen = 0, .maxlen = INT_MAX },
+#endif
};
static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
@@ -627,6 +1123,15 @@ static int parse_ct(const struct nlattr *attr, struct ovs_conntrack_info *info,
return -EINVAL;
}
break;
+#ifdef CONFIG_NF_NAT_NEEDED
+ case OVS_CT_ATTR_NAT: {
+ int err = parse_nat(a, info, log);
+
+ if (err)
+ return err;
+ break;
+ }
+#endif
default:
OVS_NLERR(log, "Unknown conntrack attr (%d)",
type);
@@ -693,6 +1198,10 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
OVS_NLERR(log, "Failed to allocate conntrack template");
return -ENOMEM;
}
+
+ __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
+ nf_conntrack_get(&ct_info.ct->ct_general);
+
if (helper) {
err = ovs_ct_add_helper(&ct_info, helper, key, log);
if (err)
@@ -704,14 +1213,80 @@ int ovs_ct_copy_action(struct net *net, const struct nlattr *attr,
if (err)
goto err_free_ct;
- __set_bit(IPS_CONFIRMED_BIT, &ct_info.ct->status);
- nf_conntrack_get(&ct_info.ct->ct_general);
return 0;
err_free_ct:
- nf_conntrack_free(ct_info.ct);
+ __ovs_ct_free_action(&ct_info);
return err;
}
+#ifdef CONFIG_NF_NAT_NEEDED
+static bool ovs_ct_nat_to_attr(const struct ovs_conntrack_info *info,
+ struct sk_buff *skb)
+{
+ struct nlattr *start;
+
+ start = nla_nest_start(skb, OVS_CT_ATTR_NAT);
+ if (!start)
+ return false;
+
+ if (info->nat & OVS_CT_SRC_NAT) {
+ if (nla_put_flag(skb, OVS_NAT_ATTR_SRC))
+ return false;
+ } else if (info->nat & OVS_CT_DST_NAT) {
+ if (nla_put_flag(skb, OVS_NAT_ATTR_DST))
+ return false;
+ } else {
+ goto out;
+ }
+
+ if (info->range.flags & NF_NAT_RANGE_MAP_IPS) {
+ if (IS_ENABLED(CONFIG_NF_NAT_IPV4) &&
+ info->family == NFPROTO_IPV4) {
+ if (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MIN,
+ info->range.min_addr.ip) ||
+ (info->range.max_addr.ip
+ != info->range.min_addr.ip &&
+ (nla_put_in_addr(skb, OVS_NAT_ATTR_IP_MAX,
+ info->range.max_addr.ip))))
+ return false;
+ } else if (IS_ENABLED(CONFIG_NF_NAT_IPV6) &&
+ info->family == NFPROTO_IPV6) {
+ if (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MIN,
+ &info->range.min_addr.in6) ||
+ (memcmp(&info->range.max_addr.in6,
+ &info->range.min_addr.in6,
+ sizeof(info->range.max_addr.in6)) &&
+ (nla_put_in6_addr(skb, OVS_NAT_ATTR_IP_MAX,
+ &info->range.max_addr.in6))))
+ return false;
+ } else {
+ return false;
+ }
+ }
+ if (info->range.flags & NF_NAT_RANGE_PROTO_SPECIFIED &&
+ (nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MIN,
+ ntohs(info->range.min_proto.all)) ||
+ (info->range.max_proto.all != info->range.min_proto.all &&
+ nla_put_u16(skb, OVS_NAT_ATTR_PROTO_MAX,
+ ntohs(info->range.max_proto.all)))))
+ return false;
+
+ if (info->range.flags & NF_NAT_RANGE_PERSISTENT &&
+ nla_put_flag(skb, OVS_NAT_ATTR_PERSISTENT))
+ return false;
+ if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM &&
+ nla_put_flag(skb, OVS_NAT_ATTR_PROTO_HASH))
+ return false;
+ if (info->range.flags & NF_NAT_RANGE_PROTO_RANDOM_FULLY &&
+ nla_put_flag(skb, OVS_NAT_ATTR_PROTO_RANDOM))
+ return false;
+out:
+ nla_nest_end(skb, start);
+
+ return true;
+}
+#endif
+
int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
struct sk_buff *skb)
{
@@ -740,7 +1315,10 @@ int ovs_ct_action_to_attr(const struct ovs_conntrack_info *ct_info,
ct_info->helper->name))
return -EMSGSIZE;
}
-
+#ifdef CONFIG_NF_NAT_NEEDED
+ if (ct_info->nat && !ovs_ct_nat_to_attr(ct_info, skb))
+ return -EMSGSIZE;
+#endif
nla_nest_end(skb, start);
return 0;
@@ -750,6 +1328,11 @@ void ovs_ct_free_action(const struct nlattr *a)
{
struct ovs_conntrack_info *ct_info = nla_data(a);
+ __ovs_ct_free_action(ct_info);
+}
+
+static void __ovs_ct_free_action(struct ovs_conntrack_info *ct_info)
+{
if (ct_info->helper)
module_put(ct_info->helper->me);
if (ct_info->ct)
diff --git a/net/openvswitch/conntrack.h b/net/openvswitch/conntrack.h
index a7544f405c16..8f6230bd6183 100644
--- a/net/openvswitch/conntrack.h
+++ b/net/openvswitch/conntrack.h
@@ -37,7 +37,8 @@ void ovs_ct_free_action(const struct nlattr *a);
#define CT_SUPPORTED_MASK (OVS_CS_F_NEW | OVS_CS_F_ESTABLISHED | \
OVS_CS_F_RELATED | OVS_CS_F_REPLY_DIR | \
- OVS_CS_F_INVALID | OVS_CS_F_TRACKED)
+ OVS_CS_F_INVALID | OVS_CS_F_TRACKED | \
+ OVS_CS_F_SRC_NAT | OVS_CS_F_DST_NAT)
#else
#include <linux/errno.h>
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 91a8b004dc51..0cc66a4e492d 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -336,12 +336,10 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
unsigned short gso_type = skb_shinfo(skb)->gso_type;
struct sw_flow_key later_key;
struct sk_buff *segs, *nskb;
- struct ovs_skb_cb ovs_cb;
int err;
- ovs_cb = *OVS_CB(skb);
+ BUILD_BUG_ON(sizeof(*OVS_CB(skb)) > SKB_SGO_CB_OFFSET);
segs = __skb_gso_segment(skb, NETIF_F_SG, false);
- *OVS_CB(skb) = ovs_cb;
if (IS_ERR(segs))
return PTR_ERR(segs);
if (segs == NULL)
@@ -359,7 +357,6 @@ static int queue_gso_packets(struct datapath *dp, struct sk_buff *skb,
/* Queue all of the segments. */
skb = segs;
do {
- *OVS_CB(skb) = ovs_cb;
if (gso_type & SKB_GSO_UDP && skb != segs)
key = &later_key;
@@ -425,10 +422,6 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
struct sk_buff *nskb = NULL;
struct sk_buff *user_skb = NULL; /* to be queued to userspace */
struct nlattr *nla;
- struct genl_info info = {
- .dst_sk = ovs_dp_get_net(dp)->genl_sock,
- .snd_portid = upcall_info->portid,
- };
size_t len;
unsigned int hlen;
int err, dp_ifindex;
@@ -469,7 +462,7 @@ static int queue_userspace_packet(struct datapath *dp, struct sk_buff *skb,
hlen = skb->len;
len = upcall_msg_size(upcall_info, hlen);
- user_skb = genlmsg_new_unicast(len, &info, GFP_ATOMIC);
+ user_skb = genlmsg_new(len, GFP_ATOMIC);
if (!user_skb) {
err = -ENOMEM;
goto out;
@@ -657,7 +650,7 @@ static const struct nla_policy packet_policy[OVS_PACKET_ATTR_MAX + 1] = {
static const struct genl_ops dp_packet_genl_ops[] = {
{ .cmd = OVS_PACKET_CMD_EXECUTE,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = packet_policy,
.doit = ovs_packet_cmd_execute
}
@@ -879,7 +872,7 @@ static struct sk_buff *ovs_flow_cmd_alloc_info(const struct sw_flow_actions *act
return NULL;
len = ovs_flow_cmd_msg_size(acts, sfid, ufid_flags);
- skb = genlmsg_new_unicast(len, info, GFP_KERNEL);
+ skb = genlmsg_new(len, GFP_KERNEL);
if (!skb)
return ERR_PTR(-ENOMEM);
@@ -1103,26 +1096,32 @@ static int ovs_flow_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct sw_flow_match match;
struct sw_flow_id sfid;
u32 ufid_flags = ovs_nla_get_ufid_flags(a[OVS_FLOW_ATTR_UFID_FLAGS]);
- int error;
+ int error = 0;
bool log = !a[OVS_FLOW_ATTR_PROBE];
bool ufid_present;
- /* Extract key. */
- error = -EINVAL;
- if (!a[OVS_FLOW_ATTR_KEY]) {
- OVS_NLERR(log, "Flow key attribute not present in set flow.");
- goto error;
- }
-
ufid_present = ovs_nla_get_ufid(&sfid, a[OVS_FLOW_ATTR_UFID], log);
- ovs_match_init(&match, &key, &mask);
- error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
- a[OVS_FLOW_ATTR_MASK], log);
+ if (a[OVS_FLOW_ATTR_KEY]) {
+ ovs_match_init(&match, &key, &mask);
+ error = ovs_nla_get_match(net, &match, a[OVS_FLOW_ATTR_KEY],
+ a[OVS_FLOW_ATTR_MASK], log);
+ } else if (!ufid_present) {
+ OVS_NLERR(log,
+ "Flow set message rejected, Key attribute missing.");
+ error = -EINVAL;
+ }
if (error)
goto error;
/* Validate actions. */
if (a[OVS_FLOW_ATTR_ACTIONS]) {
+ if (!a[OVS_FLOW_ATTR_KEY]) {
+ OVS_NLERR(log,
+ "Flow key attribute not present in set flow.");
+ error = -EINVAL;
+ goto error;
+ }
+
acts = get_flow_actions(net, a[OVS_FLOW_ATTR_ACTIONS], &key,
&mask, log);
if (IS_ERR(acts)) {
@@ -1394,12 +1393,12 @@ static const struct nla_policy flow_policy[OVS_FLOW_ATTR_MAX + 1] = {
static const struct genl_ops dp_flow_genl_ops[] = {
{ .cmd = OVS_FLOW_CMD_NEW,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_new
},
{ .cmd = OVS_FLOW_CMD_DEL,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_del
},
@@ -1410,7 +1409,7 @@ static const struct genl_ops dp_flow_genl_ops[] = {
.dumpit = ovs_flow_cmd_dump
},
{ .cmd = OVS_FLOW_CMD_SET,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = flow_policy,
.doit = ovs_flow_cmd_set,
},
@@ -1484,9 +1483,9 @@ error:
return -EMSGSIZE;
}
-static struct sk_buff *ovs_dp_cmd_alloc_info(struct genl_info *info)
+static struct sk_buff *ovs_dp_cmd_alloc_info(void)
{
- return genlmsg_new_unicast(ovs_dp_cmd_msg_size(), info, GFP_KERNEL);
+ return genlmsg_new(ovs_dp_cmd_msg_size(), GFP_KERNEL);
}
/* Called with rcu_read_lock or ovs_mutex. */
@@ -1539,7 +1538,7 @@ static int ovs_dp_cmd_new(struct sk_buff *skb, struct genl_info *info)
if (!a[OVS_DP_ATTR_NAME] || !a[OVS_DP_ATTR_UPCALL_PID])
goto err;
- reply = ovs_dp_cmd_alloc_info(info);
+ reply = ovs_dp_cmd_alloc_info();
if (!reply)
return -ENOMEM;
@@ -1660,7 +1659,7 @@ static int ovs_dp_cmd_del(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
int err;
- reply = ovs_dp_cmd_alloc_info(info);
+ reply = ovs_dp_cmd_alloc_info();
if (!reply)
return -ENOMEM;
@@ -1693,7 +1692,7 @@ static int ovs_dp_cmd_set(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
int err;
- reply = ovs_dp_cmd_alloc_info(info);
+ reply = ovs_dp_cmd_alloc_info();
if (!reply)
return -ENOMEM;
@@ -1726,7 +1725,7 @@ static int ovs_dp_cmd_get(struct sk_buff *skb, struct genl_info *info)
struct datapath *dp;
int err;
- reply = ovs_dp_cmd_alloc_info(info);
+ reply = ovs_dp_cmd_alloc_info();
if (!reply)
return -ENOMEM;
@@ -1780,12 +1779,12 @@ static const struct nla_policy datapath_policy[OVS_DP_ATTR_MAX + 1] = {
static const struct genl_ops dp_datapath_genl_ops[] = {
{ .cmd = OVS_DP_CMD_NEW,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_new
},
{ .cmd = OVS_DP_CMD_DEL,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_del
},
@@ -1796,7 +1795,7 @@ static const struct genl_ops dp_datapath_genl_ops[] = {
.dumpit = ovs_dp_cmd_dump
},
{ .cmd = OVS_DP_CMD_SET,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = datapath_policy,
.doit = ovs_dp_cmd_set,
},
@@ -1915,6 +1914,29 @@ static struct vport *lookup_vport(struct net *net,
return ERR_PTR(-EINVAL);
}
+/* Called with ovs_mutex */
+static void update_headroom(struct datapath *dp)
+{
+ unsigned dev_headroom, max_headroom = 0;
+ struct net_device *dev;
+ struct vport *vport;
+ int i;
+
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
+ dev = vport->dev;
+ dev_headroom = netdev_get_fwd_headroom(dev);
+ if (dev_headroom > max_headroom)
+ max_headroom = dev_headroom;
+ }
+ }
+
+ dp->max_headroom = max_headroom;
+ for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
+ hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
+ netdev_set_rx_headroom(vport->dev, max_headroom);
+}
+
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
@@ -1980,6 +2002,12 @@ restart:
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
+
+ if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
+ update_headroom(dp);
+ else
+ netdev_set_rx_headroom(vport->dev, dp->max_headroom);
+
BUG_ON(err < 0);
ovs_unlock();
@@ -2046,8 +2074,10 @@ exit_unlock_free:
static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
+ bool must_update_headroom = false;
struct nlattr **a = info->attrs;
struct sk_buff *reply;
+ struct datapath *dp;
struct vport *vport;
int err;
@@ -2069,7 +2099,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_DEL);
BUG_ON(err < 0);
+
+ /* the vport deletion may trigger dp headroom update */
+ dp = vport->dp;
+ if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
+ must_update_headroom = true;
+ netdev_reset_rx_headroom(vport->dev);
ovs_dp_detach_port(vport);
+
+ if (must_update_headroom)
+ update_headroom(dp);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, reply, info);
@@ -2161,12 +2200,12 @@ static const struct nla_policy vport_policy[OVS_VPORT_ATTR_MAX + 1] = {
static const struct genl_ops dp_vport_genl_ops[] = {
{ .cmd = OVS_VPORT_CMD_NEW,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_new
},
{ .cmd = OVS_VPORT_CMD_DEL,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_del
},
@@ -2177,7 +2216,7 @@ static const struct genl_ops dp_vport_genl_ops[] = {
.dumpit = ovs_vport_cmd_dump
},
{ .cmd = OVS_VPORT_CMD_SET,
- .flags = GENL_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
+ .flags = GENL_UNS_ADMIN_PERM, /* Requires CAP_NET_ADMIN privilege. */
.policy = vport_policy,
.doit = ovs_vport_cmd_set,
},
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 67bdecd9fdc1..427e39a045cf 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -68,6 +68,8 @@ struct dp_stats_percpu {
* ovs_mutex and RCU.
* @stats_percpu: Per-CPU datapath statistics.
* @net: Reference to net namespace.
+ * @max_headroom: the maximum headroom of all vports in this datapath; it will
+ * be used by all the internal vports in this dp.
*
* Context: See the comment on locking at the top of datapath.c for additional
* locking information.
@@ -89,6 +91,8 @@ struct datapath {
possible_net_t net;
u32 user_features;
+
+ u32 max_headroom;
};
/**
diff --git a/net/openvswitch/dp_notify.c b/net/openvswitch/dp_notify.c
index a7a80a6b77b0..653d073bae45 100644
--- a/net/openvswitch/dp_notify.c
+++ b/net/openvswitch/dp_notify.c
@@ -58,7 +58,7 @@ void ovs_dp_notify_wq(struct work_struct *work)
struct hlist_node *n;
hlist_for_each_entry_safe(vport, n, &dp->ports[i], dp_hash_node) {
- if (vport->ops->type != OVS_VPORT_TYPE_NETDEV)
+ if (vport->ops->type == OVS_VPORT_TYPE_INTERNAL)
continue;
if (!(vport->dev->priv_flags & IFF_OVS_DATAPATH))
diff --git a/net/openvswitch/flow.h b/net/openvswitch/flow.h
index 1d055c559eaf..03378e75a67c 100644
--- a/net/openvswitch/flow.h
+++ b/net/openvswitch/flow.h
@@ -55,7 +55,7 @@ struct ovs_tunnel_info {
FIELD_SIZEOF(struct sw_flow_key, recirc_id))
struct sw_flow_key {
- u8 tun_opts[255];
+ u8 tun_opts[IP_TUNNEL_OPTS_MAX];
u8 tun_opts_len;
struct ip_tunnel_key tun_key; /* Encapsulating tunnel key. */
struct {
diff --git a/net/openvswitch/flow_netlink.c b/net/openvswitch/flow_netlink.c
index 907d6fd28ede..689c17264221 100644
--- a/net/openvswitch/flow_netlink.c
+++ b/net/openvswitch/flow_netlink.c
@@ -1959,6 +1959,12 @@ static int validate_and_copy_set_tun(const struct nlattr *attr,
if (!tun_dst)
return -ENOMEM;
+ err = dst_cache_init(&tun_dst->u.tun_info.dst_cache, GFP_KERNEL);
+ if (err) {
+ dst_release((struct dst_entry *)tun_dst);
+ return err;
+ }
+
a = __add_action(sfa, OVS_KEY_ATTR_TUNNEL_INFO, NULL,
sizeof(*ovs_tun), log);
if (IS_ERR(a)) {
@@ -2038,9 +2044,6 @@ static int validate_set(const struct nlattr *a,
break;
case OVS_KEY_ATTR_TUNNEL:
- if (eth_p_mpls(eth_type))
- return -EINVAL;
-
if (masked)
return -EINVAL; /* Masked tunnel set not supported. */
@@ -2434,7 +2437,10 @@ static int set_action_to_attr(const struct nlattr *a, struct sk_buff *skb)
if (!start)
return -EMSGSIZE;
- err = ovs_nla_put_tunnel_info(skb, tun_info);
+ err = ip_tun_to_nlattr(skb, &tun_info->key,
+ ip_tunnel_info_opts(tun_info),
+ tun_info->options_len,
+ ip_tunnel_info_af(tun_info));
if (err)
return err;
nla_nest_end(skb, start);
diff --git a/net/openvswitch/vport-geneve.c b/net/openvswitch/vport-geneve.c
index 69f1de58a3b4..1a1fcec88695 100644
--- a/net/openvswitch/vport-geneve.c
+++ b/net/openvswitch/vport-geneve.c
@@ -34,7 +34,7 @@ static struct vport_ops ovs_geneve_vport_ops;
* @dst_port: destination port.
*/
struct geneve_port {
- u16 port_no;
+ u16 dst_port;
};
static inline struct geneve_port *geneve_vport(const struct vport *vport)
@@ -47,7 +47,7 @@ static int geneve_get_options(const struct vport *vport,
{
struct geneve_port *geneve_port = geneve_vport(vport);
- if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->port_no))
+ if (nla_put_u16(skb, OVS_TUNNEL_ATTR_DST_PORT, geneve_port->dst_port))
return -EMSGSIZE;
return 0;
}
@@ -83,7 +83,7 @@ static struct vport *geneve_tnl_create(const struct vport_parms *parms)
return vport;
geneve_port = geneve_vport(vport);
- geneve_port->port_no = dst_port;
+ geneve_port->dst_port = dst_port;
rtnl_lock();
dev = geneve_dev_create_fb(net, parms->name, NET_NAME_USER, dst_port);
@@ -117,7 +117,6 @@ static struct vport_ops ovs_geneve_vport_ops = {
.destroy = ovs_netdev_tunnel_destroy,
.get_options = geneve_get_options,
.send = dev_queue_xmit,
- .owner = THIS_MODULE,
};
static int __init ovs_geneve_tnl_init(void)
diff --git a/net/openvswitch/vport-gre.c b/net/openvswitch/vport-gre.c
index c3257d78d3d2..7f8897f33a67 100644
--- a/net/openvswitch/vport-gre.c
+++ b/net/openvswitch/vport-gre.c
@@ -89,7 +89,6 @@ static struct vport_ops ovs_gre_vport_ops = {
.create = gre_create,
.send = dev_queue_xmit,
.destroy = ovs_netdev_tunnel_destroy,
- .owner = THIS_MODULE,
};
static int __init ovs_gre_tnl_init(void)
diff --git a/net/openvswitch/vport-internal_dev.c b/net/openvswitch/vport-internal_dev.c
index ec76398a792f..7c8b90bf0e54 100644
--- a/net/openvswitch/vport-internal_dev.c
+++ b/net/openvswitch/vport-internal_dev.c
@@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
return stats;
}
+static void internal_set_rx_headroom(struct net_device *dev, int new_hr)
+{
+ dev->needed_headroom = new_hr;
+}
+
static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
@@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_change_mtu = internal_dev_change_mtu,
.ndo_get_stats64 = internal_get_stats,
+ .ndo_set_rx_headroom = internal_set_rx_headroom,
};
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
@@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev)
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
- netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
+ netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
+ IFF_PHONY_HEADROOM;
netdev->destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
@@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
err = -ENOMEM;
goto error_free_netdev;
}
+ vport->dev->needed_headroom = vport->dp->max_headroom;
dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
internal_dev = internal_dev_priv(vport->dev);
diff --git a/net/openvswitch/vport-netdev.c b/net/openvswitch/vport-netdev.c
index b327368a3848..4e3972344aa6 100644
--- a/net/openvswitch/vport-netdev.c
+++ b/net/openvswitch/vport-netdev.c
@@ -58,7 +58,7 @@ static void netdev_port_receive(struct sk_buff *skb)
return;
skb_push(skb, ETH_HLEN);
- ovs_skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
+ skb_postpush_rcsum(skb, skb->data, ETH_HLEN);
ovs_vport_receive(vport, skb, skb_tunnel_info(skb));
return;
error:
@@ -105,7 +105,7 @@ struct vport *ovs_netdev_link(struct vport *vport, const char *name)
rtnl_lock();
err = netdev_master_upper_dev_link(vport->dev,
- get_dpdev(vport->dp));
+ get_dpdev(vport->dp), NULL, NULL);
if (err)
goto error_unlock;
@@ -180,9 +180,13 @@ void ovs_netdev_tunnel_destroy(struct vport *vport)
if (vport->dev->priv_flags & IFF_OVS_DATAPATH)
ovs_netdev_detach_dev(vport);
- /* Early release so we can unregister the device */
+ /* We can be invoked by both explicit vport deletion and
+ * underlying netdev deregistration; delete the link only
+ * if it's not already shutting down.
+ */
+ if (vport->dev->reg_state == NETREG_REGISTERED)
+ rtnl_delete_link(vport->dev);
dev_put(vport->dev);
- rtnl_delete_link(vport->dev);
vport->dev = NULL;
rtnl_unlock();
diff --git a/net/openvswitch/vport-vxlan.c b/net/openvswitch/vport-vxlan.c
index 1605691d9414..5eb7694348b5 100644
--- a/net/openvswitch/vport-vxlan.c
+++ b/net/openvswitch/vport-vxlan.c
@@ -90,7 +90,9 @@ static struct vport *vxlan_tnl_create(const struct vport_parms *parms)
int err;
struct vxlan_config conf = {
.no_share = true,
- .flags = VXLAN_F_COLLECT_METADATA,
+ .flags = VXLAN_F_COLLECT_METADATA | VXLAN_F_UDP_ZERO_CSUM6_RX,
+ /* Don't restrict the packets that can be sent by MTU */
+ .mtu = IP_MAX_MTU,
};
if (!options) {
diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c
index 0ac0fd004d7e..31cbc8c5c7db 100644
--- a/net/openvswitch/vport.c
+++ b/net/openvswitch/vport.c
@@ -71,7 +71,7 @@ static struct hlist_head *hash_bucket(const struct net *net, const char *name)
return &dev_table[hash & (VPORT_HASH_BUCKETS - 1)];
}
-int ovs_vport_ops_register(struct vport_ops *ops)
+int __ovs_vport_ops_register(struct vport_ops *ops)
{
int err = -EEXIST;
struct vport_ops *o;
@@ -87,7 +87,7 @@ errout:
ovs_unlock();
return err;
}
-EXPORT_SYMBOL_GPL(ovs_vport_ops_register);
+EXPORT_SYMBOL_GPL(__ovs_vport_ops_register);
void ovs_vport_ops_unregister(struct vport_ops *ops)
{
@@ -256,8 +256,8 @@ int ovs_vport_set_options(struct vport *vport, struct nlattr *options)
*
* @vport: vport to delete.
*
- * Detaches @vport from its datapath and destroys it. It is possible to fail
- * for reasons such as lack of memory. ovs_mutex must be held.
+ * Detaches @vport from its datapath and destroys it. ovs_mutex must
+ * be held.
*/
void ovs_vport_del(struct vport *vport)
{
diff --git a/net/openvswitch/vport.h b/net/openvswitch/vport.h
index bdfd82a7c064..f01f28a567ad 100644
--- a/net/openvswitch/vport.h
+++ b/net/openvswitch/vport.h
@@ -70,7 +70,7 @@ struct vport_portids {
/**
* struct vport - one port within a datapath
- * @rcu: RCU callback head for deferred destruction.
+ * @dev: Pointer to net_device.
* @dp: Datapath to which this port belongs.
* @upcall_portids: RCU protected 'struct vport_portids'.
* @port_no: Index into @dp's @ports array.
@@ -78,6 +78,7 @@ struct vport_portids {
* @dp_hash_node: Element in @datapath->ports hash table in datapath.c.
* @ops: Class structure.
* @detach_list: list used for detaching vport in net-exit call.
+ * @rcu: RCU callback head for deferred destruction.
*/
struct vport {
struct net_device *dev;
@@ -184,40 +185,19 @@ static inline struct vport *vport_from_priv(void *priv)
int ovs_vport_receive(struct vport *, struct sk_buff *,
const struct ip_tunnel_info *);
-static inline void ovs_skb_postpush_rcsum(struct sk_buff *skb,
- const void *start, unsigned int len)
-{
- if (skb->ip_summed == CHECKSUM_COMPLETE)
- skb->csum = csum_add(skb->csum, csum_partial(start, len, 0));
-}
-
static inline const char *ovs_vport_name(struct vport *vport)
{
return vport->dev->name;
}
-int ovs_vport_ops_register(struct vport_ops *ops);
-void ovs_vport_ops_unregister(struct vport_ops *ops);
-
-static inline struct rtable *ovs_tunnel_route_lookup(struct net *net,
- const struct ip_tunnel_key *key,
- u32 mark,
- struct flowi4 *fl,
- u8 protocol)
-{
- struct rtable *rt;
-
- memset(fl, 0, sizeof(*fl));
- fl->daddr = key->u.ipv4.dst;
- fl->saddr = key->u.ipv4.src;
- fl->flowi4_tos = RT_TOS(key->tos);
- fl->flowi4_mark = mark;
- fl->flowi4_proto = protocol;
-
- rt = ip_route_output_key(net, fl);
- return rt;
-}
+int __ovs_vport_ops_register(struct vport_ops *ops);
+#define ovs_vport_ops_register(ops) \
+ ({ \
+ (ops)->owner = THIS_MODULE; \
+ __ovs_vport_ops_register(ops); \
+ })
+void ovs_vport_ops_unregister(struct vport_ops *ops);
void ovs_vport_send(struct vport *vport, struct sk_buff *skb);
#endif /* vport.h */
diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index 691660b9b7ef..f12c17f355d9 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -557,9 +557,8 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
{
struct net_device *dev;
unsigned int mbits = 0, msec = 0, div = 0, tmo = 0;
- struct ethtool_cmd ecmd;
+ struct ethtool_link_ksettings ecmd;
int err;
- u32 speed;
rtnl_lock();
dev = __dev_get_by_index(sock_net(&po->sk), po->ifindex);
@@ -567,19 +566,19 @@ static int prb_calc_retire_blk_tmo(struct packet_sock *po,
rtnl_unlock();
return DEFAULT_PRB_RETIRE_TOV;
}
- err = __ethtool_get_settings(dev, &ecmd);
- speed = ethtool_cmd_speed(&ecmd);
+ err = __ethtool_get_link_ksettings(dev, &ecmd);
rtnl_unlock();
if (!err) {
/*
* If the link speed is so slow you don't really
* need to worry about perf anyways
*/
- if (speed < SPEED_1000 || speed == SPEED_UNKNOWN) {
+ if (ecmd.base.speed < SPEED_1000 ||
+ ecmd.base.speed == SPEED_UNKNOWN) {
return DEFAULT_PRB_RETIRE_TOV;
} else {
msec = 1;
- div = speed / 1000;
+ div = ecmd.base.speed / 1000;
}
}
@@ -1741,6 +1740,20 @@ static void fanout_release(struct sock *sk)
kfree_rcu(po->rollover, rcu);
}
+static bool packet_extra_vlan_len_allowed(const struct net_device *dev,
+ struct sk_buff *skb)
+{
+ /* Earlier code assumed this would be a VLAN pkt, double-check
+ * this now that we have the actual packet in hand. We can only
+ * do this check on Ethernet devices.
+ */
+ if (unlikely(dev->type != ARPHRD_ETHER))
+ return false;
+
+ skb_reset_mac_header(skb);
+ return likely(eth_hdr(skb)->h_proto == htons(ETH_P_8021Q));
+}
+
static const struct proto_ops packet_ops;
static const struct proto_ops packet_ops_spkt;
@@ -1902,18 +1915,14 @@ retry:
goto retry;
}
- if (len > (dev->mtu + dev->hard_header_len + extra_len)) {
- /* Earlier code assumed this would be a VLAN pkt,
- * double-check this now that we have the actual
- * packet in hand.
- */
- struct ethhdr *ehdr;
- skb_reset_mac_header(skb);
- ehdr = eth_hdr(skb);
- if (ehdr->h_proto != htons(ETH_P_8021Q)) {
- err = -EMSGSIZE;
- goto out_unlock;
- }
+ if (!dev_validate_header(dev, skb->data, len)) {
+ err = -EINVAL;
+ goto out_unlock;
+ }
+ if (len > (dev->mtu + dev->hard_header_len + extra_len) &&
+ !packet_extra_vlan_len_allowed(dev, skb)) {
+ err = -EMSGSIZE;
+ goto out_unlock;
}
skb->protocol = proto;
@@ -1954,6 +1963,64 @@ static unsigned int run_filter(struct sk_buff *skb,
return res;
}
+static int __packet_rcv_vnet(const struct sk_buff *skb,
+ struct virtio_net_hdr *vnet_hdr)
+{
+ *vnet_hdr = (const struct virtio_net_hdr) { 0 };
+
+ if (skb_is_gso(skb)) {
+ struct skb_shared_info *sinfo = skb_shinfo(skb);
+
+ /* This is a hint as to how much should be linear. */
+ vnet_hdr->hdr_len =
+ __cpu_to_virtio16(vio_le(), skb_headlen(skb));
+ vnet_hdr->gso_size =
+ __cpu_to_virtio16(vio_le(), sinfo->gso_size);
+
+ if (sinfo->gso_type & SKB_GSO_TCPV4)
+ vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
+ else if (sinfo->gso_type & SKB_GSO_TCPV6)
+ vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
+ else if (sinfo->gso_type & SKB_GSO_UDP)
+ vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_UDP;
+ else if (sinfo->gso_type & SKB_GSO_FCOE)
+ return -EINVAL;
+ else
+ BUG();
+
+ if (sinfo->gso_type & SKB_GSO_TCP_ECN)
+ vnet_hdr->gso_type |= VIRTIO_NET_HDR_GSO_ECN;
+ } else
+ vnet_hdr->gso_type = VIRTIO_NET_HDR_GSO_NONE;
+
+ if (skb->ip_summed == CHECKSUM_PARTIAL) {
+ vnet_hdr->flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
+ vnet_hdr->csum_start = __cpu_to_virtio16(vio_le(),
+ skb_checksum_start_offset(skb));
+ vnet_hdr->csum_offset = __cpu_to_virtio16(vio_le(),
+ skb->csum_offset);
+ } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
+ vnet_hdr->flags = VIRTIO_NET_HDR_F_DATA_VALID;
+ } /* else everything is zero */
+
+ return 0;
+}
+
+static int packet_rcv_vnet(struct msghdr *msg, const struct sk_buff *skb,
+ size_t *len)
+{
+ struct virtio_net_hdr vnet_hdr;
+
+ if (*len < sizeof(vnet_hdr))
+ return -EINVAL;
+ *len -= sizeof(vnet_hdr);
+
+ if (__packet_rcv_vnet(skb, &vnet_hdr))
+ return -EINVAL;
+
+ return memcpy_to_msg(msg, (void *)&vnet_hdr, sizeof(vnet_hdr));
+}
+
/*
* This function makes lazy skb cloning in hope that most of packets
* are discarded by BPF.
@@ -2142,7 +2209,9 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
unsigned int maclen = skb_network_offset(skb);
netoff = TPACKET_ALIGN(po->tp_hdrlen +
(maclen < 16 ? 16 : maclen)) +
- po->tp_reserve;
+ po->tp_reserve;
+ if (po->has_vnet_hdr)
+ netoff += sizeof(struct virtio_net_hdr);
macoff = netoff - maclen;
}
if (po->tp_version <= TPACKET_V2) {
@@ -2179,7 +2248,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
h.raw = packet_current_rx_frame(po, skb,
TP_STATUS_KERNEL, (macoff+snaplen));
if (!h.raw)
- goto ring_is_full;
+ goto drop_n_account;
if (po->tp_version <= TPACKET_V2) {
packet_increment_rx_head(po, &po->rx_ring);
/*
@@ -2198,6 +2267,14 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
}
spin_unlock(&sk->sk_receive_queue.lock);
+ if (po->has_vnet_hdr) {
+ if (__packet_rcv_vnet(skb, h.raw + macoff -
+ sizeof(struct virtio_net_hdr))) {
+ spin_lock(&sk->sk_receive_queue.lock);
+ goto drop_n_account;
+ }
+ }
+
skb_copy_bits(skb, 0, h.raw + macoff, snaplen);
if (!(ts_status = tpacket_get_timestamp(skb, &ts, po->tp_tstamp)))
@@ -2293,7 +2370,7 @@ drop:
kfree_skb(skb);
return 0;
-ring_is_full:
+drop_n_account:
po->stats.stats1.tp_drops++;
spin_unlock(&sk->sk_receive_queue.lock);
@@ -2320,27 +2397,101 @@ static void tpacket_destruct_skb(struct sk_buff *skb)
sock_wfree(skb);
}
-static bool ll_header_truncated(const struct net_device *dev, int len)
+static void tpacket_set_protocol(const struct net_device *dev,
+ struct sk_buff *skb)
+{
+ if (dev->type == ARPHRD_ETHER) {
+ skb_reset_mac_header(skb);
+ skb->protocol = eth_hdr(skb)->h_proto;
+ }
+}
+
+static int __packet_snd_vnet_parse(struct virtio_net_hdr *vnet_hdr, size_t len)
{
- /* net device doesn't like empty head */
- if (unlikely(len <= dev->hard_header_len)) {
- net_warn_ratelimited("%s: packet size is too short (%d <= %d)\n",
- current->comm, len, dev->hard_header_len);
- return true;
+ unsigned short gso_type = 0;
+
+ if ((vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
+ (__virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
+ __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2 >
+ __virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len)))
+ vnet_hdr->hdr_len = __cpu_to_virtio16(vio_le(),
+ __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start) +
+ __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset) + 2);
+
+ if (__virtio16_to_cpu(vio_le(), vnet_hdr->hdr_len) > len)
+ return -EINVAL;
+
+ if (vnet_hdr->gso_type != VIRTIO_NET_HDR_GSO_NONE) {
+ switch (vnet_hdr->gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
+ case VIRTIO_NET_HDR_GSO_TCPV4:
+ gso_type = SKB_GSO_TCPV4;
+ break;
+ case VIRTIO_NET_HDR_GSO_TCPV6:
+ gso_type = SKB_GSO_TCPV6;
+ break;
+ case VIRTIO_NET_HDR_GSO_UDP:
+ gso_type = SKB_GSO_UDP;
+ break;
+ default:
+ return -EINVAL;
+ }
+
+ if (vnet_hdr->gso_type & VIRTIO_NET_HDR_GSO_ECN)
+ gso_type |= SKB_GSO_TCP_ECN;
+
+ if (vnet_hdr->gso_size == 0)
+ return -EINVAL;
}
- return false;
+ vnet_hdr->gso_type = gso_type; /* changes type, temporary storage */
+ return 0;
+}
+
+static int packet_snd_vnet_parse(struct msghdr *msg, size_t *len,
+ struct virtio_net_hdr *vnet_hdr)
+{
+ int n;
+
+ if (*len < sizeof(*vnet_hdr))
+ return -EINVAL;
+ *len -= sizeof(*vnet_hdr);
+
+ n = copy_from_iter(vnet_hdr, sizeof(*vnet_hdr), &msg->msg_iter);
+ if (n != sizeof(*vnet_hdr))
+ return -EFAULT;
+
+ return __packet_snd_vnet_parse(vnet_hdr, *len);
+}
+
+static int packet_snd_vnet_gso(struct sk_buff *skb,
+ struct virtio_net_hdr *vnet_hdr)
+{
+ if (vnet_hdr->flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
+ u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_start);
+ u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr->csum_offset);
+
+ if (!skb_partial_csum_set(skb, s, o))
+ return -EINVAL;
+ }
+
+ skb_shinfo(skb)->gso_size =
+ __virtio16_to_cpu(vio_le(), vnet_hdr->gso_size);
+ skb_shinfo(skb)->gso_type = vnet_hdr->gso_type;
+
+ /* Header must be checked, and gso_segs computed. */
+ skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
+ skb_shinfo(skb)->gso_segs = 0;
+ return 0;
}
static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
- void *frame, struct net_device *dev, int size_max,
- __be16 proto, unsigned char *addr, int hlen)
+ void *frame, struct net_device *dev, void *data, int tp_len,
+ __be16 proto, unsigned char *addr, int hlen, int copylen)
{
union tpacket_uhdr ph;
- int to_write, offset, len, tp_len, nr_frags, len_max;
+ int to_write, offset, len, nr_frags, len_max;
struct socket *sock = po->sk.sk_socket;
struct page *page;
- void *data;
int err;
ph.raw = frame;
@@ -2352,53 +2503,9 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
sock_tx_timestamp(&po->sk, &skb_shinfo(skb)->tx_flags);
skb_shinfo(skb)->destructor_arg = ph.raw;
- switch (po->tp_version) {
- case TPACKET_V2:
- tp_len = ph.h2->tp_len;
- break;
- default:
- tp_len = ph.h1->tp_len;
- break;
- }
- if (unlikely(tp_len > size_max)) {
- pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
- return -EMSGSIZE;
- }
-
skb_reserve(skb, hlen);
skb_reset_network_header(skb);
- if (!packet_use_direct_xmit(po))
- skb_probe_transport_header(skb, 0);
- if (unlikely(po->tp_tx_has_off)) {
- int off_min, off_max, off;
- off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
- off_max = po->tx_ring.frame_size - tp_len;
- if (sock->type == SOCK_DGRAM) {
- switch (po->tp_version) {
- case TPACKET_V2:
- off = ph.h2->tp_net;
- break;
- default:
- off = ph.h1->tp_net;
- break;
- }
- } else {
- switch (po->tp_version) {
- case TPACKET_V2:
- off = ph.h2->tp_mac;
- break;
- default:
- off = ph.h1->tp_mac;
- break;
- }
- }
- if (unlikely((off < off_min) || (off_max < off)))
- return -EINVAL;
- data = ph.raw + off;
- } else {
- data = ph.raw + po->tp_hdrlen - sizeof(struct sockaddr_ll);
- }
to_write = tp_len;
if (sock->type == SOCK_DGRAM) {
@@ -2406,18 +2513,21 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
NULL, tp_len);
if (unlikely(err < 0))
return -EINVAL;
- } else if (dev->hard_header_len) {
- if (ll_header_truncated(dev, tp_len))
- return -EINVAL;
+ } else if (copylen) {
+ int hdrlen = min_t(int, copylen, tp_len);
skb_push(skb, dev->hard_header_len);
- err = skb_store_bits(skb, 0, data,
- dev->hard_header_len);
+ skb_put(skb, copylen - dev->hard_header_len);
+ err = skb_store_bits(skb, 0, data, hdrlen);
if (unlikely(err))
return err;
+ if (!dev_validate_header(dev, skb->data, hdrlen))
+ return -EINVAL;
+ if (!skb->protocol)
+ tpacket_set_protocol(dev, skb);
- data += dev->hard_header_len;
- to_write -= dev->hard_header_len;
+ data += hdrlen;
+ to_write -= hdrlen;
}
offset = offset_in_page(data);
@@ -2449,6 +2559,63 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb,
len = ((to_write > len_max) ? len_max : to_write);
}
+ skb_probe_transport_header(skb, 0);
+
+ return tp_len;
+}
+
+static int tpacket_parse_header(struct packet_sock *po, void *frame,
+ int size_max, void **data)
+{
+ union tpacket_uhdr ph;
+ int tp_len, off;
+
+ ph.raw = frame;
+
+ switch (po->tp_version) {
+ case TPACKET_V2:
+ tp_len = ph.h2->tp_len;
+ break;
+ default:
+ tp_len = ph.h1->tp_len;
+ break;
+ }
+ if (unlikely(tp_len > size_max)) {
+ pr_err("packet size is too long (%d > %d)\n", tp_len, size_max);
+ return -EMSGSIZE;
+ }
+
+ if (unlikely(po->tp_tx_has_off)) {
+ int off_min, off_max;
+
+ off_min = po->tp_hdrlen - sizeof(struct sockaddr_ll);
+ off_max = po->tx_ring.frame_size - tp_len;
+ if (po->sk.sk_type == SOCK_DGRAM) {
+ switch (po->tp_version) {
+ case TPACKET_V2:
+ off = ph.h2->tp_net;
+ break;
+ default:
+ off = ph.h1->tp_net;
+ break;
+ }
+ } else {
+ switch (po->tp_version) {
+ case TPACKET_V2:
+ off = ph.h2->tp_mac;
+ break;
+ default:
+ off = ph.h1->tp_mac;
+ break;
+ }
+ }
+ if (unlikely((off < off_min) || (off_max < off)))
+ return -EINVAL;
+ } else {
+ off = po->tp_hdrlen - sizeof(struct sockaddr_ll);
+ }
+
+ *data = frame + off;
return tp_len;
}
@@ -2456,6 +2623,7 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
{
struct sk_buff *skb;
struct net_device *dev;
+ struct virtio_net_hdr *vnet_hdr = NULL;
__be16 proto;
int err, reserve = 0;
void *ph;
@@ -2463,9 +2631,10 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
bool need_wait = !(msg->msg_flags & MSG_DONTWAIT);
int tp_len, size_max;
unsigned char *addr;
+ void *data;
int len_sum = 0;
int status = TP_STATUS_AVAILABLE;
- int hlen, tlen;
+ int hlen, tlen, copylen = 0;
mutex_lock(&po->pg_vec_lock);
@@ -2493,12 +2662,13 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
if (unlikely(!(dev->flags & IFF_UP)))
goto out_put;
- reserve = dev->hard_header_len + VLAN_HLEN;
+ if (po->sk.sk_socket->type == SOCK_RAW)
+ reserve = dev->hard_header_len;
size_max = po->tx_ring.frame_size
- (po->tp_hdrlen - sizeof(struct sockaddr_ll));
- if (size_max > dev->mtu + reserve)
- size_max = dev->mtu + reserve;
+ if ((size_max > dev->mtu + reserve + VLAN_HLEN) && !po->has_vnet_hdr)
+ size_max = dev->mtu + reserve + VLAN_HLEN;
do {
ph = packet_current_frame(po, &po->tx_ring,
@@ -2509,11 +2679,30 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
continue;
}
+ skb = NULL;
+ tp_len = tpacket_parse_header(po, ph, size_max, &data);
+ if (tp_len < 0)
+ goto tpacket_error;
+
status = TP_STATUS_SEND_REQUEST;
hlen = LL_RESERVED_SPACE(dev);
tlen = dev->needed_tailroom;
+ if (po->has_vnet_hdr) {
+ vnet_hdr = data;
+ data += sizeof(*vnet_hdr);
+ tp_len -= sizeof(*vnet_hdr);
+ if (tp_len < 0 ||
+ __packet_snd_vnet_parse(vnet_hdr, tp_len)) {
+ tp_len = -EINVAL;
+ goto tpacket_error;
+ }
+ copylen = __virtio16_to_cpu(vio_le(),
+ vnet_hdr->hdr_len);
+ }
+ copylen = max_t(int, copylen, dev->hard_header_len);
skb = sock_alloc_send_skb(&po->sk,
- hlen + tlen + sizeof(struct sockaddr_ll),
+ hlen + tlen + sizeof(struct sockaddr_ll) +
+ (copylen - dev->hard_header_len),
!need_wait, &err);
if (unlikely(skb == NULL)) {
@@ -2522,22 +2711,16 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
err = len_sum;
goto out_status;
}
- tp_len = tpacket_fill_skb(po, skb, ph, dev, size_max, proto,
- addr, hlen);
+ tp_len = tpacket_fill_skb(po, skb, ph, dev, data, tp_len, proto,
+ addr, hlen, copylen);
if (likely(tp_len >= 0) &&
- tp_len > dev->mtu + dev->hard_header_len) {
- struct ethhdr *ehdr;
- /* Earlier code assumed this would be a VLAN pkt,
- * double-check this now that we have the actual
- * packet in hand.
- */
+ tp_len > dev->mtu + reserve &&
+ !po->has_vnet_hdr &&
+ !packet_extra_vlan_len_allowed(dev, skb))
+ tp_len = -EMSGSIZE;
- skb_reset_mac_header(skb);
- ehdr = eth_hdr(skb);
- if (ehdr->h_proto != htons(ETH_P_8021Q))
- tp_len = -EMSGSIZE;
- }
if (unlikely(tp_len < 0)) {
+tpacket_error:
if (po->tp_loss) {
__packet_set_status(po, ph,
TP_STATUS_AVAILABLE);
@@ -2551,6 +2734,11 @@ static int tpacket_snd(struct packet_sock *po, struct msghdr *msg)
}
}
+ if (po->has_vnet_hdr && packet_snd_vnet_gso(skb, vnet_hdr)) {
+ tp_len = -EINVAL;
+ goto tpacket_error;
+ }
+
packet_pick_tx_queue(dev, skb);
skb->destructor = tpacket_destruct_skb;
@@ -2633,12 +2821,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
struct sockcm_cookie sockc;
struct virtio_net_hdr vnet_hdr = { 0 };
int offset = 0;
- int vnet_hdr_len;
struct packet_sock *po = pkt_sk(sk);
- unsigned short gso_type = 0;
int hlen, tlen;
int extra_len = 0;
- ssize_t n;
/*
* Get and verify the address.
@@ -2676,53 +2861,9 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (sock->type == SOCK_RAW)
reserve = dev->hard_header_len;
if (po->has_vnet_hdr) {
- vnet_hdr_len = sizeof(vnet_hdr);
-
- err = -EINVAL;
- if (len < vnet_hdr_len)
- goto out_unlock;
-
- len -= vnet_hdr_len;
-
- err = -EFAULT;
- n = copy_from_iter(&vnet_hdr, vnet_hdr_len, &msg->msg_iter);
- if (n != vnet_hdr_len)
- goto out_unlock;
-
- if ((vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) &&
- (__virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) +
- __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2 >
- __virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len)))
- vnet_hdr.hdr_len = __cpu_to_virtio16(vio_le(),
- __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start) +
- __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset) + 2);
-
- err = -EINVAL;
- if (__virtio16_to_cpu(vio_le(), vnet_hdr.hdr_len) > len)
+ err = packet_snd_vnet_parse(msg, &len, &vnet_hdr);
+ if (err)
goto out_unlock;
-
- if (vnet_hdr.gso_type != VIRTIO_NET_HDR_GSO_NONE) {
- switch (vnet_hdr.gso_type & ~VIRTIO_NET_HDR_GSO_ECN) {
- case VIRTIO_NET_HDR_GSO_TCPV4:
- gso_type = SKB_GSO_TCPV4;
- break;
- case VIRTIO_NET_HDR_GSO_TCPV6:
- gso_type = SKB_GSO_TCPV6;
- break;
- case VIRTIO_NET_HDR_GSO_UDP:
- gso_type = SKB_GSO_UDP;
- break;
- default:
- goto out_unlock;
- }
-
- if (vnet_hdr.gso_type & VIRTIO_NET_HDR_GSO_ECN)
- gso_type |= SKB_GSO_TCP_ECN;
-
- if (vnet_hdr.gso_size == 0)
- goto out_unlock;
-
- }
}
if (unlikely(sock_flag(sk, SOCK_NOFCS))) {
@@ -2734,7 +2875,8 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
}
err = -EMSGSIZE;
- if (!gso_type && (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
+ if (!vnet_hdr.gso_type &&
+ (len > dev->mtu + reserve + VLAN_HLEN + extra_len))
goto out_unlock;
err = -ENOBUFS;
@@ -2753,9 +2895,6 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
offset = dev_hard_header(skb, dev, ntohs(proto), addr, NULL, len);
if (unlikely(offset < 0))
goto out_free;
- } else {
- if (ll_header_truncated(dev, len))
- goto out_free;
}
/* Returns -EFAULT on error */
@@ -2763,20 +2902,18 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
if (err)
goto out_free;
+ if (sock->type == SOCK_RAW &&
+ !dev_validate_header(dev, skb->data, len)) {
+ err = -EINVAL;
+ goto out_free;
+ }
+
sock_tx_timestamp(sk, &skb_shinfo(skb)->tx_flags);
- if (!gso_type && (len > dev->mtu + reserve + extra_len)) {
- /* Earlier code assumed this would be a VLAN pkt,
- * double-check this now that we have the actual
- * packet in hand.
- */
- struct ethhdr *ehdr;
- skb_reset_mac_header(skb);
- ehdr = eth_hdr(skb);
- if (ehdr->h_proto != htons(ETH_P_8021Q)) {
- err = -EMSGSIZE;
- goto out_free;
- }
+ if (!vnet_hdr.gso_type && (len > dev->mtu + reserve + extra_len) &&
+ !packet_extra_vlan_len_allowed(dev, skb)) {
+ err = -EMSGSIZE;
+ goto out_free;
}
skb->protocol = proto;
@@ -2787,28 +2924,14 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len)
packet_pick_tx_queue(dev, skb);
if (po->has_vnet_hdr) {
- if (vnet_hdr.flags & VIRTIO_NET_HDR_F_NEEDS_CSUM) {
- u16 s = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_start);
- u16 o = __virtio16_to_cpu(vio_le(), vnet_hdr.csum_offset);
- if (!skb_partial_csum_set(skb, s, o)) {
- err = -EINVAL;
- goto out_free;
- }
- }
-
- skb_shinfo(skb)->gso_size =
- __virtio16_to_cpu(vio_le(), vnet_hdr.gso_size);
- skb_shinfo(skb)->gso_type = gso_type;
-
- /* Header must be checked, and gso_segs computed. */
- skb_shinfo(skb)->gso_type |= SKB_GSO_DODGY;
- skb_shinfo(skb)->gso_segs = 0;
-
- len += vnet_hdr_len;
+ err = packet_snd_vnet_gso(skb, &vnet_hdr);
+ if (err)
+ goto out_free;
+ len += sizeof(vnet_hdr);
}
- if (!packet_use_direct_xmit(po))
- skb_probe_transport_header(skb, reserve);
+ skb_probe_transport_header(skb, reserve);
+
if (unlikely(extra_len == 4))
skb->no_fcs = 1;
@@ -2911,22 +3034,40 @@ static int packet_release(struct socket *sock)
* Attach a packet hook.
*/
-static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
+static int packet_do_bind(struct sock *sk, const char *name, int ifindex,
+ __be16 proto)
{
struct packet_sock *po = pkt_sk(sk);
struct net_device *dev_curr;
__be16 proto_curr;
bool need_rehook;
+ struct net_device *dev = NULL;
+ int ret = 0;
+ bool unlisted = false;
- if (po->fanout) {
- if (dev)
- dev_put(dev);
-
+ if (po->fanout)
return -EINVAL;
- }
lock_sock(sk);
spin_lock(&po->bind_lock);
+ rcu_read_lock();
+
+ if (name) {
+ dev = dev_get_by_name_rcu(sock_net(sk), name);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+ } else if (ifindex) {
+ dev = dev_get_by_index_rcu(sock_net(sk), ifindex);
+ if (!dev) {
+ ret = -ENODEV;
+ goto out_unlock;
+ }
+ }
+
+ if (dev)
+ dev_hold(dev);
proto_curr = po->prot_hook.type;
dev_curr = po->prot_hook.dev;
@@ -2934,14 +3075,29 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
need_rehook = proto_curr != proto || dev_curr != dev;
if (need_rehook) {
- unregister_prot_hook(sk, true);
+ if (po->running) {
+ rcu_read_unlock();
+ __unregister_prot_hook(sk, true);
+ rcu_read_lock();
+ dev_curr = po->prot_hook.dev;
+ if (dev)
+ unlisted = !dev_get_by_index_rcu(sock_net(sk),
+ dev->ifindex);
+ }
po->num = proto;
po->prot_hook.type = proto;
- po->prot_hook.dev = dev;
- po->ifindex = dev ? dev->ifindex : 0;
- packet_cached_dev_assign(po, dev);
+ if (unlikely(unlisted)) {
+ dev_put(dev);
+ po->prot_hook.dev = NULL;
+ po->ifindex = -1;
+ packet_cached_dev_reset(po);
+ } else {
+ po->prot_hook.dev = dev;
+ po->ifindex = dev ? dev->ifindex : 0;
+ packet_cached_dev_assign(po, dev);
+ }
}
if (dev_curr)
dev_put(dev_curr);
@@ -2949,7 +3105,7 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
if (proto == 0 || !need_rehook)
goto out_unlock;
- if (!dev || (dev->flags & IFF_UP)) {
+ if (!unlisted && (!dev || (dev->flags & IFF_UP))) {
register_prot_hook(sk);
} else {
sk->sk_err = ENETDOWN;
@@ -2958,9 +3114,10 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto)
}
out_unlock:
+ rcu_read_unlock();
spin_unlock(&po->bind_lock);
release_sock(sk);
- return 0;
+ return ret;
}
/*
@@ -2972,8 +3129,6 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
{
struct sock *sk = sock->sk;
char name[15];
- struct net_device *dev;
- int err = -ENODEV;
/*
* Check legality
@@ -2983,19 +3138,13 @@ static int packet_bind_spkt(struct socket *sock, struct sockaddr *uaddr,
return -EINVAL;
strlcpy(name, uaddr->sa_data, sizeof(name));
- dev = dev_get_by_name(sock_net(sk), name);
- if (dev)
- err = packet_do_bind(sk, dev, pkt_sk(sk)->num);
- return err;
+ return packet_do_bind(sk, name, 0, pkt_sk(sk)->num);
}
static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_ll *sll = (struct sockaddr_ll *)uaddr;
struct sock *sk = sock->sk;
- struct net_device *dev = NULL;
- int err;
-
/*
* Check legality
@@ -3006,16 +3155,8 @@ static int packet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len
if (sll->sll_family != AF_PACKET)
return -EINVAL;
- if (sll->sll_ifindex) {
- err = -ENODEV;
- dev = dev_get_by_index(sock_net(sk), sll->sll_ifindex);
- if (dev == NULL)
- goto out;
- }
- err = packet_do_bind(sk, dev, sll->sll_protocol ? : pkt_sk(sk)->num);
-
-out:
- return err;
+ return packet_do_bind(sk, NULL, sll->sll_ifindex,
+ sll->sll_protocol ? : pkt_sk(sk)->num);
}
static struct proto packet_proto = {
@@ -3157,51 +3298,10 @@ static int packet_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
packet_rcv_has_room(pkt_sk(sk), NULL);
if (pkt_sk(sk)->has_vnet_hdr) {
- struct virtio_net_hdr vnet_hdr = { 0 };
-
- err = -EINVAL;
- vnet_hdr_len = sizeof(vnet_hdr);
- if (len < vnet_hdr_len)
- goto out_free;
-
- len -= vnet_hdr_len;
-
- if (skb_is_gso(skb)) {
- struct skb_shared_info *sinfo = skb_shinfo(skb);
-
- /* This is a hint as to how much should be linear. */
- vnet_hdr.hdr_len =
- __cpu_to_virtio16(vio_le(), skb_headlen(skb));
- vnet_hdr.gso_size =
- __cpu_to_virtio16(vio_le(), sinfo->gso_size);
- if (sinfo->gso_type & SKB_GSO_TCPV4)
- vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV4;
- else if (sinfo->gso_type & SKB_GSO_TCPV6)
- vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_TCPV6;
- else if (sinfo->gso_type & SKB_GSO_UDP)
- vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_UDP;
- else if (sinfo->gso_type & SKB_GSO_FCOE)
- goto out_free;
- else
- BUG();
- if (sinfo->gso_type & SKB_GSO_TCP_ECN)
- vnet_hdr.gso_type |= VIRTIO_NET_HDR_GSO_ECN;
- } else
- vnet_hdr.gso_type = VIRTIO_NET_HDR_GSO_NONE;
-
- if (skb->ip_summed == CHECKSUM_PARTIAL) {
- vnet_hdr.flags = VIRTIO_NET_HDR_F_NEEDS_CSUM;
- vnet_hdr.csum_start = __cpu_to_virtio16(vio_le(),
- skb_checksum_start_offset(skb));
- vnet_hdr.csum_offset = __cpu_to_virtio16(vio_le(),
- skb->csum_offset);
- } else if (skb->ip_summed == CHECKSUM_UNNECESSARY) {
- vnet_hdr.flags = VIRTIO_NET_HDR_F_DATA_VALID;
- } /* else everything is zero */
-
- err = memcpy_to_msg(msg, (void *)&vnet_hdr, vnet_hdr_len);
- if (err < 0)
+ err = packet_rcv_vnet(msg, skb, &len);
+ if (err)
goto out_free;
+ vnet_hdr_len = sizeof(struct virtio_net_hdr);
}
/* You lose any data beyond the buffer you gave. If it worries
@@ -3532,8 +3632,6 @@ packet_setsockopt(struct socket *sock, int level, int optname, char __user *optv
}
if (optlen < len)
return -EINVAL;
- if (pkt_sk(sk)->has_vnet_hdr)
- return -EINVAL;
if (copy_from_user(&req_u.req, optval, len))
return -EFAULT;
return packet_set_ring(sk, &req_u, 0,
@@ -4053,7 +4151,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
/* Opening a Tx-ring is NOT supported in TPACKET_V3 */
if (!closing && tx_ring && (po->tp_version > TPACKET_V2)) {
- WARN(1, "Tx-ring is not supported.\n");
+ net_warn_ratelimited("Tx-ring is not supported.\n");
goto out;
}
@@ -4089,7 +4187,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
err = -EINVAL;
if (unlikely((int)req->tp_block_size <= 0))
goto out;
- if (unlikely(req->tp_block_size & (PAGE_SIZE - 1)))
+ if (unlikely(!PAGE_ALIGNED(req->tp_block_size)))
goto out;
if (po->tp_version >= TPACKET_V3 &&
(int)(req->tp_block_size -
@@ -4101,8 +4199,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
if (unlikely(req->tp_frame_size & (TPACKET_ALIGNMENT - 1)))
goto out;
- rb->frames_per_block = req->tp_block_size/req->tp_frame_size;
- if (unlikely(rb->frames_per_block <= 0))
+ rb->frames_per_block = req->tp_block_size / req->tp_frame_size;
+ if (unlikely(rb->frames_per_block == 0))
goto out;
if (unlikely((rb->frames_per_block * req->tp_block_nr) !=
req->tp_frame_nr))
diff --git a/net/phonet/af_phonet.c b/net/phonet/af_phonet.c
index 10d42f3220ab..f925753668a7 100644
--- a/net/phonet/af_phonet.c
+++ b/net/phonet/af_phonet.c
@@ -377,6 +377,10 @@ static int phonet_rcv(struct sk_buff *skb, struct net_device *dev,
struct sockaddr_pn sa;
u16 len;
+ skb = skb_share_check(skb, GFP_ATOMIC);
+ if (!skb)
+ return NET_RX_DROP;
+
/* check we have at least a full Phonet header */
if (!pskb_pull(skb, sizeof(struct phonethdr)))
goto out;
diff --git a/net/phonet/socket.c b/net/phonet/socket.c
index d575ef4e9aa6..ffd5f2297584 100644
--- a/net/phonet/socket.c
+++ b/net/phonet/socket.c
@@ -140,13 +140,15 @@ void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb)
rcu_read_unlock();
}
-void pn_sock_hash(struct sock *sk)
+int pn_sock_hash(struct sock *sk)
{
struct hlist_head *hlist = pn_hash_list(pn_sk(sk)->sobject);
mutex_lock(&pnsocks.lock);
sk_add_node_rcu(sk, hlist);
mutex_unlock(&pnsocks.lock);
+
+ return 0;
}
EXPORT_SYMBOL(pn_sock_hash);
@@ -200,7 +202,7 @@ static int pn_socket_bind(struct socket *sock, struct sockaddr *addr, int len)
pn->resource = spn->spn_resource;
/* Enable RX on the socket */
- sk->sk_prot->hash(sk);
+ err = sk->sk_prot->hash(sk);
out_port:
mutex_unlock(&port_mutex);
out:
diff --git a/net/rds/Kconfig b/net/rds/Kconfig
index f2c670ba7b9b..bffde4b46c5d 100644
--- a/net/rds/Kconfig
+++ b/net/rds/Kconfig
@@ -4,14 +4,13 @@ config RDS
depends on INET
---help---
The RDS (Reliable Datagram Sockets) protocol provides reliable,
- sequenced delivery of datagrams over Infiniband, iWARP,
- or TCP.
+ sequenced delivery of datagrams over Infiniband or TCP.
config RDS_RDMA
- tristate "RDS over Infiniband and iWARP"
+ tristate "RDS over Infiniband"
depends on RDS && INFINIBAND && INFINIBAND_ADDR_TRANS
---help---
- Allow RDS to use Infiniband and iWARP as a transport.
+ Allow RDS to use Infiniband as a transport.
This transport supports RDMA operations.
config RDS_TCP
diff --git a/net/rds/Makefile b/net/rds/Makefile
index 56d3f6023ced..0e72bec1529f 100644
--- a/net/rds/Makefile
+++ b/net/rds/Makefile
@@ -6,9 +6,7 @@ rds-y := af_rds.o bind.o cong.o connection.o info.o message.o \
obj-$(CONFIG_RDS_RDMA) += rds_rdma.o
rds_rdma-y := rdma_transport.o \
ib.o ib_cm.o ib_recv.o ib_ring.o ib_send.o ib_stats.o \
- ib_sysctl.o ib_rdma.o \
- iw.o iw_cm.o iw_recv.o iw_ring.o iw_send.o iw_stats.o \
- iw_sysctl.o iw_rdma.o
+ ib_sysctl.o ib_rdma.o ib_fmr.o ib_frmr.o
obj-$(CONFIG_RDS_TCP) += rds_tcp.o
diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
index b5476aebd68d..6beaeb1138f3 100644
--- a/net/rds/af_rds.c
+++ b/net/rds/af_rds.c
@@ -277,6 +277,27 @@ static int rds_set_transport(struct rds_sock *rs, char __user *optval,
return rs->rs_transport ? 0 : -ENOPROTOOPT;
}
+static int rds_enable_recvtstamp(struct sock *sk, char __user *optval,
+ int optlen)
+{
+ int val, valbool;
+
+ if (optlen != sizeof(int))
+ return -EFAULT;
+
+ if (get_user(val, (int __user *)optval))
+ return -EFAULT;
+
+ valbool = val ? 1 : 0;
+
+ if (valbool)
+ sock_set_flag(sk, SOCK_RCVTSTAMP);
+ else
+ sock_reset_flag(sk, SOCK_RCVTSTAMP);
+
+ return 0;
+}
+
static int rds_setsockopt(struct socket *sock, int level, int optname,
char __user *optval, unsigned int optlen)
{
@@ -312,6 +333,11 @@ static int rds_setsockopt(struct socket *sock, int level, int optname,
ret = rds_set_transport(rs, optval, optlen);
release_sock(sock->sk);
break;
+ case SO_TIMESTAMP:
+ lock_sock(sock->sk);
+ ret = rds_enable_recvtstamp(sock->sk, optval, optlen);
+ release_sock(sock->sk);
+ break;
default:
ret = -ENOPROTOOPT;
}
diff --git a/net/rds/connection.c b/net/rds/connection.c
index d4564036a339..e3b118cae81d 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -186,12 +186,6 @@ static struct rds_connection *__rds_conn_create(struct net *net,
}
}
- if (trans == NULL) {
- kmem_cache_free(rds_conn_slab, conn);
- conn = ERR_PTR(-ENODEV);
- goto out;
- }
-
conn->c_trans = trans;
ret = trans->conn_alloc(conn, gfp);
diff --git a/net/rds/ib.c b/net/rds/ib.c
index a833ab7898fe..b5342fddaf98 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -42,15 +42,16 @@
#include "rds.h"
#include "ib.h"
+#include "ib_mr.h"
-unsigned int rds_ib_fmr_1m_pool_size = RDS_FMR_1M_POOL_SIZE;
-unsigned int rds_ib_fmr_8k_pool_size = RDS_FMR_8K_POOL_SIZE;
+unsigned int rds_ib_mr_1m_pool_size = RDS_MR_1M_POOL_SIZE;
+unsigned int rds_ib_mr_8k_pool_size = RDS_MR_8K_POOL_SIZE;
unsigned int rds_ib_retry_count = RDS_IB_DEFAULT_RETRY_COUNT;
-module_param(rds_ib_fmr_1m_pool_size, int, 0444);
-MODULE_PARM_DESC(rds_ib_fmr_1m_pool_size, " Max number of 1M fmr per HCA");
-module_param(rds_ib_fmr_8k_pool_size, int, 0444);
-MODULE_PARM_DESC(rds_ib_fmr_8k_pool_size, " Max number of 8K fmr per HCA");
+module_param(rds_ib_mr_1m_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_mr_1m_pool_size, " Max number of 1M mr per HCA");
+module_param(rds_ib_mr_8k_pool_size, int, 0444);
+MODULE_PARM_DESC(rds_ib_mr_8k_pool_size, " Max number of 8K mr per HCA");
module_param(rds_ib_retry_count, int, 0444);
MODULE_PARM_DESC(rds_ib_retry_count, " Number of hw retries before reporting an error");
@@ -122,44 +123,40 @@ void rds_ib_dev_put(struct rds_ib_device *rds_ibdev)
static void rds_ib_add_one(struct ib_device *device)
{
struct rds_ib_device *rds_ibdev;
- struct ib_device_attr *dev_attr;
/* Only handle IB (no iWARP) devices */
if (device->node_type != RDMA_NODE_IB_CA)
return;
- dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
- if (!dev_attr)
- return;
-
- if (ib_query_device(device, dev_attr)) {
- rdsdebug("Query device failed for %s\n", device->name);
- goto free_attr;
- }
-
rds_ibdev = kzalloc_node(sizeof(struct rds_ib_device), GFP_KERNEL,
ibdev_to_node(device));
if (!rds_ibdev)
- goto free_attr;
+ return;
spin_lock_init(&rds_ibdev->spinlock);
atomic_set(&rds_ibdev->refcount, 1);
INIT_WORK(&rds_ibdev->free_work, rds_ib_dev_free);
- rds_ibdev->max_wrs = dev_attr->max_qp_wr;
- rds_ibdev->max_sge = min(dev_attr->max_sge, RDS_IB_MAX_SGE);
+ rds_ibdev->max_wrs = device->attrs.max_qp_wr;
+ rds_ibdev->max_sge = min(device->attrs.max_sge, RDS_IB_MAX_SGE);
+
+ rds_ibdev->has_fr = (device->attrs.device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS);
+ rds_ibdev->has_fmr = (device->alloc_fmr && device->dealloc_fmr &&
+ device->map_phys_fmr && device->unmap_fmr);
+ rds_ibdev->use_fastreg = (rds_ibdev->has_fr && !rds_ibdev->has_fmr);
- rds_ibdev->fmr_max_remaps = dev_attr->max_map_per_fmr?: 32;
- rds_ibdev->max_1m_fmrs = dev_attr->max_mr ?
- min_t(unsigned int, (dev_attr->max_mr / 2),
- rds_ib_fmr_1m_pool_size) : rds_ib_fmr_1m_pool_size;
+ rds_ibdev->fmr_max_remaps = device->attrs.max_map_per_fmr?: 32;
+ rds_ibdev->max_1m_mrs = device->attrs.max_mr ?
+ min_t(unsigned int, (device->attrs.max_mr / 2),
+ rds_ib_mr_1m_pool_size) : rds_ib_mr_1m_pool_size;
- rds_ibdev->max_8k_fmrs = dev_attr->max_mr ?
- min_t(unsigned int, ((dev_attr->max_mr / 2) * RDS_MR_8K_SCALE),
- rds_ib_fmr_8k_pool_size) : rds_ib_fmr_8k_pool_size;
+ rds_ibdev->max_8k_mrs = device->attrs.max_mr ?
+ min_t(unsigned int, ((device->attrs.max_mr / 2) * RDS_MR_8K_SCALE),
+ rds_ib_mr_8k_pool_size) : rds_ib_mr_8k_pool_size;
- rds_ibdev->max_initiator_depth = dev_attr->max_qp_init_rd_atom;
- rds_ibdev->max_responder_resources = dev_attr->max_qp_rd_atom;
+ rds_ibdev->max_initiator_depth = device->attrs.max_qp_init_rd_atom;
+ rds_ibdev->max_responder_resources = device->attrs.max_qp_rd_atom;
rds_ibdev->dev = device;
rds_ibdev->pd = ib_alloc_pd(device);
@@ -182,10 +179,14 @@ static void rds_ib_add_one(struct ib_device *device)
goto put_dev;
}
- rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_fmrs = %d, max_8k_fmrs = %d\n",
- dev_attr->max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
- rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_fmrs,
- rds_ibdev->max_8k_fmrs);
+ rdsdebug("RDS/IB: max_mr = %d, max_wrs = %d, max_sge = %d, fmr_max_remaps = %d, max_1m_mrs = %d, max_8k_mrs = %d\n",
+ device->attrs.max_fmr, rds_ibdev->max_wrs, rds_ibdev->max_sge,
+ rds_ibdev->fmr_max_remaps, rds_ibdev->max_1m_mrs,
+ rds_ibdev->max_8k_mrs);
+
+ pr_info("RDS/IB: %s: %s supported and preferred\n",
+ device->name,
+ rds_ibdev->use_fastreg ? "FRMR" : "FMR");
INIT_LIST_HEAD(&rds_ibdev->ipaddr_list);
INIT_LIST_HEAD(&rds_ibdev->conn_list);
@@ -202,8 +203,6 @@ static void rds_ib_add_one(struct ib_device *device)
put_dev:
rds_ib_dev_put(rds_ibdev);
-free_attr:
- kfree(dev_attr);
}
/*
@@ -336,7 +335,7 @@ static int rds_ib_laddr_check(struct net *net, __be32 addr)
/* Create a CMA ID and try to bind it. This catches both
* IB and iWARP capable NICs.
*/
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
+ cm_id = rdma_create_id(&init_net, NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id))
return PTR_ERR(cm_id);
@@ -376,7 +375,7 @@ void rds_ib_exit(void)
rds_ib_sysctl_exit();
rds_ib_recv_exit();
rds_trans_unregister(&rds_ib_transport);
- rds_ib_fmr_exit();
+ rds_ib_mr_exit();
}
struct rds_transport rds_ib_transport = {
@@ -412,13 +411,13 @@ int rds_ib_init(void)
INIT_LIST_HEAD(&rds_ib_devices);
- ret = rds_ib_fmr_init();
+ ret = rds_ib_mr_init();
if (ret)
goto out;
ret = ib_register_client(&rds_ib_client);
if (ret)
- goto out_fmr_exit;
+ goto out_mr_exit;
ret = rds_ib_sysctl_init();
if (ret)
@@ -442,8 +441,8 @@ out_sysctl:
rds_ib_sysctl_exit();
out_ibreg:
rds_ib_unregister_client();
-out_fmr_exit:
- rds_ib_fmr_exit();
+out_mr_exit:
+ rds_ib_mr_exit();
out:
return ret;
}
diff --git a/net/rds/ib.h b/net/rds/ib.h
index f17d09567890..627fb79aee65 100644
--- a/net/rds/ib.h
+++ b/net/rds/ib.h
@@ -9,17 +9,12 @@
#include "rds.h"
#include "rdma_transport.h"
-#define RDS_FMR_1M_POOL_SIZE (8192 / 2)
-#define RDS_FMR_1M_MSG_SIZE 256
-#define RDS_FMR_8K_MSG_SIZE 2
-#define RDS_MR_8K_SCALE (256 / (RDS_FMR_8K_MSG_SIZE + 1))
-#define RDS_FMR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
-
#define RDS_IB_MAX_SGE 8
#define RDS_IB_RECV_SGE 2
#define RDS_IB_DEFAULT_RECV_WR 1024
#define RDS_IB_DEFAULT_SEND_WR 256
+#define RDS_IB_DEFAULT_FR_WR 512
#define RDS_IB_DEFAULT_RETRY_COUNT 2
@@ -28,7 +23,6 @@
#define RDS_IB_RECYCLE_BATCH_COUNT 32
#define RDS_IB_WC_MAX 32
-#define RDS_IB_SEND_OP BIT_ULL(63)
extern struct rw_semaphore rds_ib_devices_lock;
extern struct list_head rds_ib_devices;
@@ -75,7 +69,11 @@ struct rds_ib_connect_private {
struct rds_ib_send_work {
void *s_op;
- struct ib_send_wr s_wr;
+ union {
+ struct ib_send_wr s_wr;
+ struct ib_rdma_wr s_rdma_wr;
+ struct ib_atomic_wr s_atomic_wr;
+ };
struct ib_sge s_sge[RDS_IB_MAX_SGE];
unsigned long s_queued;
};
@@ -125,6 +123,9 @@ struct rds_ib_connection {
struct ib_wc i_send_wc[RDS_IB_WC_MAX];
struct ib_wc i_recv_wc[RDS_IB_WC_MAX];
+ /* To control the number of wrs from fastreg */
+ atomic_t i_fastreg_wrs;
+
/* interrupt handling */
struct tasklet_struct i_send_tasklet;
struct tasklet_struct i_recv_tasklet;
@@ -203,12 +204,16 @@ struct rds_ib_device {
struct list_head conn_list;
struct ib_device *dev;
struct ib_pd *pd;
- unsigned int max_fmrs;
+ bool has_fmr;
+ bool has_fr;
+ bool use_fastreg;
+
+ unsigned int max_mrs;
struct rds_ib_mr_pool *mr_1m_pool;
struct rds_ib_mr_pool *mr_8k_pool;
unsigned int fmr_max_remaps;
- unsigned int max_8k_fmrs;
- unsigned int max_1m_fmrs;
+ unsigned int max_8k_mrs;
+ unsigned int max_1m_mrs;
int max_sge;
unsigned int max_wrs;
unsigned int max_initiator_depth;
@@ -262,6 +267,8 @@ struct rds_ib_statistics {
uint64_t s_ib_rdma_mr_1m_pool_flush;
uint64_t s_ib_rdma_mr_1m_pool_wait;
uint64_t s_ib_rdma_mr_1m_pool_depleted;
+ uint64_t s_ib_rdma_mr_8k_reused;
+ uint64_t s_ib_rdma_mr_1m_reused;
uint64_t s_ib_atomic_cswp;
uint64_t s_ib_atomic_fadd;
};
@@ -313,8 +320,6 @@ struct rds_ib_device *rds_ib_get_client_data(struct ib_device *device);
void rds_ib_dev_put(struct rds_ib_device *rds_ibdev);
extern struct ib_client rds_ib_client;
-extern unsigned int rds_ib_fmr_1m_pool_size;
-extern unsigned int rds_ib_fmr_8k_pool_size;
extern unsigned int rds_ib_retry_count;
extern spinlock_t ib_nodev_conns_lock;
@@ -344,17 +349,7 @@ int rds_ib_update_ipaddr(struct rds_ib_device *rds_ibdev, __be32 ipaddr);
void rds_ib_add_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_remove_conn(struct rds_ib_device *rds_ibdev, struct rds_connection *conn);
void rds_ib_destroy_nodev_conns(void);
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
- int npages);
-void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo);
-void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
-void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret);
-void rds_ib_sync_mr(void *trans_private, int dir);
-void rds_ib_free_mr(void *trans_private, int invalidate);
-void rds_ib_flush_mrs(void);
-int rds_ib_fmr_init(void);
-void rds_ib_fmr_exit(void);
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc);
/* ib_recv.c */
int rds_ib_recv_init(void);
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 2b2370e7f356..8764970f0c24 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -236,12 +236,10 @@ static void rds_ib_cq_comp_handler_recv(struct ib_cq *cq, void *context)
tasklet_schedule(&ic->i_recv_tasklet);
}
-static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
- struct ib_wc *wcs,
- struct rds_ib_ack_state *ack_state)
+static void poll_scq(struct rds_ib_connection *ic, struct ib_cq *cq,
+ struct ib_wc *wcs)
{
- int nr;
- int i;
+ int nr, i;
struct ib_wc *wc;
while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
@@ -251,10 +249,12 @@ static void poll_cq(struct rds_ib_connection *ic, struct ib_cq *cq,
(unsigned long long)wc->wr_id, wc->status,
wc->byte_len, be32_to_cpu(wc->ex.imm_data));
- if (wc->wr_id & RDS_IB_SEND_OP)
+ if (wc->wr_id <= ic->i_send_ring.w_nr ||
+ wc->wr_id == RDS_IB_ACK_WR_ID)
rds_ib_send_cqe_handler(ic, wc);
else
- rds_ib_recv_cqe_handler(ic, wc, ack_state);
+ rds_ib_mr_cqe_handler(ic, wc);
+
}
}
}
@@ -263,14 +263,12 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
struct rds_connection *conn = ic->conn;
- struct rds_ib_ack_state state;
rds_ib_stats_inc(s_ib_tasklet_call);
- memset(&state, 0, sizeof(state));
- poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+ poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
ib_req_notify_cq(ic->i_send_cq, IB_CQ_NEXT_COMP);
- poll_cq(ic, ic->i_send_cq, ic->i_send_wc, &state);
+ poll_scq(ic, ic->i_send_cq, ic->i_send_wc);
if (rds_conn_up(conn) &&
(!test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
@@ -278,6 +276,25 @@ static void rds_ib_tasklet_fn_send(unsigned long data)
rds_send_xmit(ic->conn);
}
+static void poll_rcq(struct rds_ib_connection *ic, struct ib_cq *cq,
+ struct ib_wc *wcs,
+ struct rds_ib_ack_state *ack_state)
+{
+ int nr, i;
+ struct ib_wc *wc;
+
+ while ((nr = ib_poll_cq(cq, RDS_IB_WC_MAX, wcs)) > 0) {
+ for (i = 0; i < nr; i++) {
+ wc = wcs + i;
+ rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
+ (unsigned long long)wc->wr_id, wc->status,
+ wc->byte_len, be32_to_cpu(wc->ex.imm_data));
+
+ rds_ib_recv_cqe_handler(ic, wc, ack_state);
+ }
+ }
+}
+
static void rds_ib_tasklet_fn_recv(unsigned long data)
{
struct rds_ib_connection *ic = (struct rds_ib_connection *)data;
@@ -291,9 +308,9 @@ static void rds_ib_tasklet_fn_recv(unsigned long data)
rds_ib_stats_inc(s_ib_tasklet_call);
memset(&state, 0, sizeof(state));
- poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+ poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- poll_cq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
+ poll_rcq(ic, ic->i_recv_cq, ic->i_recv_wc, &state);
if (state.ack_next_valid)
rds_ib_set_ack(ic, state.ack_next, state.ack_required);
@@ -351,7 +368,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
struct ib_qp_init_attr attr;
struct ib_cq_init_attr cq_attr = {};
struct rds_ib_device *rds_ibdev;
- int ret;
+ int ret, fr_queue_space;
/*
* It's normal to see a null device if an incoming connection races
@@ -361,6 +378,12 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
if (!rds_ibdev)
return -EOPNOTSUPP;
+ /* The fr_queue_space is currently set to 512, to add extra space on
+ * completion queue and send queue. This extra space is used for FRMR
+ * registration and invalidation work requests
+ */
+ fr_queue_space = (rds_ibdev->use_fastreg ? RDS_IB_DEFAULT_FR_WR : 0);
+
/* add the conn now so that connection establishment has the dev */
rds_ib_add_conn(rds_ibdev, conn);
@@ -372,7 +395,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
/* Protection domain and memory range */
ic->i_pd = rds_ibdev->pd;
- cq_attr.cqe = ic->i_send_ring.w_nr + 1;
+ cq_attr.cqe = ic->i_send_ring.w_nr + fr_queue_space + 1;
ic->i_send_cq = ib_create_cq(dev, rds_ib_cq_comp_handler_send,
rds_ib_cq_event_handler, conn,
@@ -412,7 +435,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.event_handler = rds_ib_qp_event_handler;
attr.qp_context = conn;
/* + 1 to allow for the single ack message */
- attr.cap.max_send_wr = ic->i_send_ring.w_nr + 1;
+ attr.cap.max_send_wr = ic->i_send_ring.w_nr + fr_queue_space + 1;
attr.cap.max_recv_wr = ic->i_recv_ring.w_nr + 1;
attr.cap.max_send_sge = rds_ibdev->max_sge;
attr.cap.max_recv_sge = RDS_IB_RECV_SGE;
@@ -420,6 +443,7 @@ static int rds_ib_setup_qp(struct rds_connection *conn)
attr.qp_type = IB_QPT_RC;
attr.send_cq = ic->i_send_cq;
attr.recv_cq = ic->i_recv_cq;
+ atomic_set(&ic->i_fastreg_wrs, RDS_IB_DEFAULT_FR_WR);
/*
* XXX this can fail if max_*_wr is too large? Are we supposed
@@ -668,7 +692,7 @@ int rds_ib_conn_connect(struct rds_connection *conn)
/* XXX I wonder what affect the port space has */
/* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
+ ic->i_cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, conn,
RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(ic->i_cm_id)) {
ret = PTR_ERR(ic->i_cm_id);
@@ -739,7 +763,8 @@ void rds_ib_conn_shutdown(struct rds_connection *conn)
*/
wait_event(rds_ib_ring_empty_wait,
rds_ib_ring_empty(&ic->i_recv_ring) &&
- (atomic_read(&ic->i_signaled_sends) == 0));
+ (atomic_read(&ic->i_signaled_sends) == 0) &&
+ (atomic_read(&ic->i_fastreg_wrs) == RDS_IB_DEFAULT_FR_WR));
tasklet_kill(&ic->i_send_tasklet);
tasklet_kill(&ic->i_recv_tasklet);
diff --git a/net/rds/ib_fmr.c b/net/rds/ib_fmr.c
new file mode 100644
index 000000000000..4fe8f4fec4ee
--- /dev/null
+++ b/net/rds/ib_fmr.c
@@ -0,0 +1,248 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev, int npages)
+{
+ struct rds_ib_mr_pool *pool;
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_fmr *fmr;
+ int err = 0;
+
+ if (npages <= RDS_MR_8K_MSG_SIZE)
+ pool = rds_ibdev->mr_8k_pool;
+ else
+ pool = rds_ibdev->mr_1m_pool;
+
+ ibmr = rds_ib_try_reuse_ibmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+ rdsibdev_to_node(rds_ibdev));
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ fmr = &ibmr->u.fmr;
+ fmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
+ (IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE |
+ IB_ACCESS_REMOTE_ATOMIC),
+ &pool->fmr_attr);
+ if (IS_ERR(fmr->fmr)) {
+ err = PTR_ERR(fmr->fmr);
+ fmr->fmr = NULL;
+ pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, err);
+ goto out_no_cigar;
+ }
+
+ ibmr->pool = pool;
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+ return ibmr;
+
+out_no_cigar:
+ if (ibmr) {
+ if (fmr->fmr)
+ ib_dealloc_fmr(fmr->fmr);
+ kfree(ibmr);
+ }
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
+ struct scatterlist *sg, unsigned int nents)
+{
+ struct ib_device *dev = rds_ibdev->dev;
+ struct rds_ib_fmr *fmr = &ibmr->u.fmr;
+ struct scatterlist *scat = sg;
+ u64 io_addr = 0;
+ u64 *dma_pages;
+ u32 len;
+ int page_cnt, sg_dma_len;
+ int i, j;
+ int ret;
+
+ sg_dma_len = ib_dma_map_sg(dev, sg, nents, DMA_BIDIRECTIONAL);
+ if (unlikely(!sg_dma_len)) {
+ pr_warn("RDS/IB: %s failed!\n", __func__);
+ return -EBUSY;
+ }
+
+ len = 0;
+ page_cnt = 0;
+
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ if (dma_addr & ~PAGE_MASK) {
+ if (i > 0)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+ if ((dma_addr + dma_len) & ~PAGE_MASK) {
+ if (i < sg_dma_len - 1)
+ return -EINVAL;
+ else
+ ++page_cnt;
+ }
+
+ len += dma_len;
+ }
+
+ page_cnt += len >> PAGE_SHIFT;
+ if (page_cnt > ibmr->pool->fmr_attr.max_pages)
+ return -EINVAL;
+
+ dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
+ rdsibdev_to_node(rds_ibdev));
+ if (!dma_pages)
+ return -ENOMEM;
+
+ page_cnt = 0;
+ for (i = 0; i < sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
+
+ for (j = 0; j < dma_len; j += PAGE_SIZE)
+ dma_pages[page_cnt++] =
+ (dma_addr & PAGE_MASK) + j;
+ }
+
+ ret = ib_map_phys_fmr(fmr->fmr, dma_pages, page_cnt, io_addr);
+ if (ret)
+ goto out;
+
+ /* Success - we successfully remapped the MR, so we can
+ * safely tear down the old mapping.
+ */
+ rds_ib_teardown_mr(ibmr);
+
+ ibmr->sg = scat;
+ ibmr->sg_len = nents;
+ ibmr->sg_dma_len = sg_dma_len;
+ ibmr->remap_count++;
+
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
+ ret = 0;
+
+out:
+ kfree(dma_pages);
+
+ return ret;
+}
+
+struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *rds_ibdev,
+ struct scatterlist *sg,
+ unsigned long nents,
+ u32 *key)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_fmr *fmr;
+ int ret;
+
+ ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
+ if (IS_ERR(ibmr))
+ return ibmr;
+
+ ibmr->device = rds_ibdev;
+ fmr = &ibmr->u.fmr;
+ ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
+ if (ret == 0)
+ *key = fmr->fmr->rkey;
+ else
+ rds_ib_free_mr(ibmr, 0);
+
+ return ibmr;
+}
+
+void rds_ib_unreg_fmr(struct list_head *list, unsigned int *nfreed,
+ unsigned long *unpinned, unsigned int goal)
+{
+ struct rds_ib_mr *ibmr, *next;
+ struct rds_ib_fmr *fmr;
+ LIST_HEAD(fmr_list);
+ int ret = 0;
+ unsigned int freed = *nfreed;
+
+ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+ list_for_each_entry(ibmr, list, unmap_list) {
+ fmr = &ibmr->u.fmr;
+ list_add(&fmr->fmr->list, &fmr_list);
+ }
+
+ ret = ib_unmap_fmr(&fmr_list);
+ if (ret)
+ pr_warn("RDS/IB: FMR invalidation failed (err=%d)\n", ret);
+
+ /* Now we can destroy the DMA mapping and unpin any pages */
+ list_for_each_entry_safe(ibmr, next, list, unmap_list) {
+ fmr = &ibmr->u.fmr;
+ *unpinned += ibmr->sg_len;
+ __rds_ib_teardown_mr(ibmr);
+ if (freed < goal ||
+ ibmr->remap_count >= ibmr->pool->fmr_attr.max_maps) {
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
+ list_del(&ibmr->unmap_list);
+ ib_dealloc_fmr(fmr->fmr);
+ kfree(ibmr);
+ freed++;
+ }
+ }
+ *nfreed = freed;
+}
+
+void rds_ib_free_fmr_list(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+
+ if (ibmr->remap_count >= pool->fmr_attr.max_maps)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+}
diff --git a/net/rds/ib_frmr.c b/net/rds/ib_frmr.c
new file mode 100644
index 000000000000..93ff038ea9d1
--- /dev/null
+++ b/net/rds/ib_frmr.c
@@ -0,0 +1,376 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "ib_mr.h"
+
+static struct rds_ib_mr *rds_ib_alloc_frmr(struct rds_ib_device *rds_ibdev,
+ int npages)
+{
+ struct rds_ib_mr_pool *pool;
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_frmr *frmr;
+ int err = 0;
+
+ if (npages <= RDS_MR_8K_MSG_SIZE)
+ pool = rds_ibdev->mr_8k_pool;
+ else
+ pool = rds_ibdev->mr_1m_pool;
+
+ ibmr = rds_ib_try_reuse_ibmr(pool);
+ if (ibmr)
+ return ibmr;
+
+ ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL,
+ rdsibdev_to_node(rds_ibdev));
+ if (!ibmr) {
+ err = -ENOMEM;
+ goto out_no_cigar;
+ }
+
+ frmr = &ibmr->u.frmr;
+ frmr->mr = ib_alloc_mr(rds_ibdev->pd, IB_MR_TYPE_MEM_REG,
+ pool->fmr_attr.max_pages);
+ if (IS_ERR(frmr->mr)) {
+ pr_warn("RDS/IB: %s failed to allocate MR", __func__);
+ goto out_no_cigar;
+ }
+
+ ibmr->pool = pool;
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
+
+ if (atomic_read(&pool->item_count) > pool->max_items_soft)
+ pool->max_items_soft = pool->max_items;
+
+ frmr->fr_state = FRMR_IS_FREE;
+ return ibmr;
+
+out_no_cigar:
+ kfree(ibmr);
+ atomic_dec(&pool->item_count);
+ return ERR_PTR(err);
+}
+
+static void rds_ib_free_frmr(struct rds_ib_mr *ibmr, bool drop)
+{
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+
+ if (drop)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+ atomic_add(ibmr->sg_len, &pool->free_pinned);
+ atomic_inc(&pool->dirty_count);
+
+ /* If we've pinned too many pages, request a flush */
+ if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
+ atomic_read(&pool->dirty_count) >= pool->max_items / 5)
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+}
+
+static int rds_ib_post_reg_frmr(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+ struct ib_send_wr *failed_wr;
+ struct ib_reg_wr reg_wr;
+ int ret;
+
+ while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ cpu_relax();
+ }
+
+ ret = ib_map_mr_sg_zbva(frmr->mr, ibmr->sg, ibmr->sg_len, PAGE_SIZE);
+ if (unlikely(ret != ibmr->sg_len))
+ return ret < 0 ? ret : -EINVAL;
+
+ /* Perform a WR for the fast_reg_mr. Each individual page
+ * in the sg list is added to the fast reg page list and placed
+ * inside the fast_reg_mr WR. The key used is a rolling 8bit
+ * counter, which should guarantee uniqueness.
+ */
+ ib_update_fast_reg_key(frmr->mr, ibmr->remap_count++);
+ frmr->fr_state = FRMR_IS_INUSE;
+
+ memset(&reg_wr, 0, sizeof(reg_wr));
+ reg_wr.wr.wr_id = (unsigned long)(void *)ibmr;
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.mr = frmr->mr;
+ reg_wr.key = frmr->mr->rkey;
+ reg_wr.access = IB_ACCESS_LOCAL_WRITE |
+ IB_ACCESS_REMOTE_READ |
+ IB_ACCESS_REMOTE_WRITE;
+ reg_wr.wr.send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = &reg_wr.wr;
+ ret = ib_post_send(ibmr->ic->i_cm_id->qp, &reg_wr.wr, &failed_wr);
+ WARN_ON(failed_wr != &reg_wr.wr);
+ if (unlikely(ret)) {
+ /* Failure here can be because of -ENOMEM as well */
+ frmr->fr_state = FRMR_IS_STALE;
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ if (printk_ratelimit())
+ pr_warn("RDS/IB: %s returned error(%d)\n",
+ __func__, ret);
+ }
+ return ret;
+}
+
+static int rds_ib_map_frmr(struct rds_ib_device *rds_ibdev,
+ struct rds_ib_mr_pool *pool,
+ struct rds_ib_mr *ibmr,
+ struct scatterlist *sg, unsigned int sg_len)
+{
+ struct ib_device *dev = rds_ibdev->dev;
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+ int i;
+ u32 len;
+ int ret = 0;
+
+ /* We want to teardown old ibmr values here and fill it up with
+ * new sg values
+ */
+ rds_ib_teardown_mr(ibmr);
+
+ ibmr->sg = sg;
+ ibmr->sg_len = sg_len;
+ ibmr->sg_dma_len = 0;
+ frmr->sg_byte_len = 0;
+ WARN_ON(ibmr->sg_dma_len);
+ ibmr->sg_dma_len = ib_dma_map_sg(dev, ibmr->sg, ibmr->sg_len,
+ DMA_BIDIRECTIONAL);
+ if (unlikely(!ibmr->sg_dma_len)) {
+ pr_warn("RDS/IB: %s failed!\n", __func__);
+ return -EBUSY;
+ }
+
+ frmr->sg_byte_len = 0;
+ frmr->dma_npages = 0;
+ len = 0;
+
+ ret = -EINVAL;
+ for (i = 0; i < ibmr->sg_dma_len; ++i) {
+ unsigned int dma_len = ib_sg_dma_len(dev, &ibmr->sg[i]);
+ u64 dma_addr = ib_sg_dma_address(dev, &ibmr->sg[i]);
+
+ frmr->sg_byte_len += dma_len;
+ if (dma_addr & ~PAGE_MASK) {
+ if (i > 0)
+ goto out_unmap;
+ else
+ ++frmr->dma_npages;
+ }
+
+ if ((dma_addr + dma_len) & ~PAGE_MASK) {
+ if (i < ibmr->sg_dma_len - 1)
+ goto out_unmap;
+ else
+ ++frmr->dma_npages;
+ }
+
+ len += dma_len;
+ }
+ frmr->dma_npages += len >> PAGE_SHIFT;
+
+ if (frmr->dma_npages > ibmr->pool->fmr_attr.max_pages) {
+ ret = -EMSGSIZE;
+ goto out_unmap;
+ }
+
+ ret = rds_ib_post_reg_frmr(ibmr);
+ if (ret)
+ goto out_unmap;
+
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
+
+ return ret;
+
+out_unmap:
+ ib_dma_unmap_sg(rds_ibdev->dev, ibmr->sg, ibmr->sg_len,
+ DMA_BIDIRECTIONAL);
+ ibmr->sg_dma_len = 0;
+ return ret;
+}
+
+static int rds_ib_post_inv(struct rds_ib_mr *ibmr)
+{
+ struct ib_send_wr *s_wr, *failed_wr;
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+ struct rdma_cm_id *i_cm_id = ibmr->ic->i_cm_id;
+ int ret = -EINVAL;
+
+ if (!i_cm_id || !i_cm_id->qp || !frmr->mr)
+ goto out;
+
+ if (frmr->fr_state != FRMR_IS_INUSE)
+ goto out;
+
+ while (atomic_dec_return(&ibmr->ic->i_fastreg_wrs) <= 0) {
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ cpu_relax();
+ }
+
+ frmr->fr_inv = true;
+ s_wr = &frmr->fr_wr;
+
+ memset(s_wr, 0, sizeof(*s_wr));
+ s_wr->wr_id = (unsigned long)(void *)ibmr;
+ s_wr->opcode = IB_WR_LOCAL_INV;
+ s_wr->ex.invalidate_rkey = frmr->mr->rkey;
+ s_wr->send_flags = IB_SEND_SIGNALED;
+
+ failed_wr = s_wr;
+ ret = ib_post_send(i_cm_id->qp, s_wr, &failed_wr);
+ WARN_ON(failed_wr != s_wr);
+ if (unlikely(ret)) {
+ frmr->fr_state = FRMR_IS_STALE;
+ frmr->fr_inv = false;
+ atomic_inc(&ibmr->ic->i_fastreg_wrs);
+ pr_err("RDS/IB: %s returned error(%d)\n", __func__, ret);
+ goto out;
+ }
+out:
+ return ret;
+}
+
+void rds_ib_mr_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
+{
+ struct rds_ib_mr *ibmr = (void *)(unsigned long)wc->wr_id;
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+
+ if (wc->status != IB_WC_SUCCESS) {
+ frmr->fr_state = FRMR_IS_STALE;
+ if (rds_conn_up(ic->conn))
+ rds_ib_conn_error(ic->conn,
+ "frmr completion <%pI4,%pI4> status %u(%s), vendor_err 0x%x, disconnecting and reconnecting\n",
+ &ic->conn->c_laddr,
+ &ic->conn->c_faddr,
+ wc->status,
+ ib_wc_status_msg(wc->status),
+ wc->vendor_err);
+ }
+
+ if (frmr->fr_inv) {
+ frmr->fr_state = FRMR_IS_FREE;
+ frmr->fr_inv = false;
+ }
+
+ atomic_inc(&ic->i_fastreg_wrs);
+}
+
+void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
+ unsigned long *unpinned, unsigned int goal)
+{
+ struct rds_ib_mr *ibmr, *next;
+ struct rds_ib_frmr *frmr;
+ int ret = 0;
+ unsigned int freed = *nfreed;
+
+ /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
+ list_for_each_entry(ibmr, list, unmap_list) {
+ if (ibmr->sg_dma_len)
+ ret |= rds_ib_post_inv(ibmr);
+ }
+ if (ret)
+ pr_warn("RDS/IB: %s failed (err=%d)\n", __func__, ret);
+
+ /* Now we can destroy the DMA mapping and unpin any pages */
+ list_for_each_entry_safe(ibmr, next, list, unmap_list) {
+ *unpinned += ibmr->sg_len;
+ frmr = &ibmr->u.frmr;
+ __rds_ib_teardown_mr(ibmr);
+ if (freed < goal || frmr->fr_state == FRMR_IS_STALE) {
+ /* Don't de-allocate if the MR is not free yet */
+ if (frmr->fr_state == FRMR_IS_INUSE)
+ continue;
+
+ if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
+ list_del(&ibmr->unmap_list);
+ if (frmr->mr)
+ ib_dereg_mr(frmr->mr);
+ kfree(ibmr);
+ freed++;
+ }
+ }
+ *nfreed = freed;
+}
+
+struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
+ struct rds_ib_connection *ic,
+ struct scatterlist *sg,
+ unsigned long nents, u32 *key)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_frmr *frmr;
+ int ret;
+
+ do {
+ if (ibmr)
+ rds_ib_free_frmr(ibmr, true);
+ ibmr = rds_ib_alloc_frmr(rds_ibdev, nents);
+ if (IS_ERR(ibmr))
+ return ibmr;
+ frmr = &ibmr->u.frmr;
+ } while (frmr->fr_state != FRMR_IS_FREE);
+
+ ibmr->ic = ic;
+ ibmr->device = rds_ibdev;
+ ret = rds_ib_map_frmr(rds_ibdev, ibmr->pool, ibmr, sg, nents);
+ if (ret == 0) {
+ *key = frmr->mr->rkey;
+ } else {
+ rds_ib_free_frmr(ibmr, false);
+ ibmr = ERR_PTR(ret);
+ }
+
+ return ibmr;
+}
+
+void rds_ib_free_frmr_list(struct rds_ib_mr *ibmr)
+{
+ struct rds_ib_mr_pool *pool = ibmr->pool;
+ struct rds_ib_frmr *frmr = &ibmr->u.frmr;
+
+ if (frmr->fr_state == FRMR_IS_STALE)
+ llist_add(&ibmr->llnode, &pool->drop_list);
+ else
+ llist_add(&ibmr->llnode, &pool->free_list);
+}
diff --git a/net/rds/ib_mr.h b/net/rds/ib_mr.h
new file mode 100644
index 000000000000..1c754f4acbe5
--- /dev/null
+++ b/net/rds/ib_mr.h
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016 Oracle. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef _RDS_IB_MR_H
+#define _RDS_IB_MR_H
+
+#include <linux/kernel.h>
+
+#include "rds.h"
+#include "ib.h"
+
+#define RDS_MR_1M_POOL_SIZE (8192 / 2)
+#define RDS_MR_1M_MSG_SIZE 256
+#define RDS_MR_8K_MSG_SIZE 2
+#define RDS_MR_8K_SCALE (256 / (RDS_MR_8K_MSG_SIZE + 1))
+#define RDS_MR_8K_POOL_SIZE (RDS_MR_8K_SCALE * (8192 / 2))
+
+struct rds_ib_fmr {
+ struct ib_fmr *fmr;
+ u64 *dma;
+};
+
+enum rds_ib_fr_state {
+ FRMR_IS_FREE, /* mr invalidated & ready for use */
+ FRMR_IS_INUSE, /* mr is in use or used & can be invalidated */
+ FRMR_IS_STALE, /* Stale MR and needs to be dropped */
+};
+
+struct rds_ib_frmr {
+ struct ib_mr *mr;
+ enum rds_ib_fr_state fr_state;
+ bool fr_inv;
+ struct ib_send_wr fr_wr;
+ unsigned int dma_npages;
+ unsigned int sg_byte_len;
+};
+
+/* This is stored as mr->r_trans_private. */
+struct rds_ib_mr {
+ struct rds_ib_device *device;
+ struct rds_ib_mr_pool *pool;
+ struct rds_ib_connection *ic;
+
+ struct llist_node llnode;
+
+ /* unmap_list is for freeing */
+ struct list_head unmap_list;
+ unsigned int remap_count;
+
+ struct scatterlist *sg;
+ unsigned int sg_len;
+ int sg_dma_len;
+
+ union {
+ struct rds_ib_fmr fmr;
+ struct rds_ib_frmr frmr;
+ } u;
+};
+
+/* Our own little MR pool */
+struct rds_ib_mr_pool {
+ unsigned int pool_type;
+ struct mutex flush_lock; /* serialize fmr invalidate */
+ struct delayed_work flush_worker; /* flush worker */
+
+ atomic_t item_count; /* total # of MRs */
+ atomic_t dirty_count; /* # dirty of MRs */
+
+ struct llist_head drop_list; /* MRs not reached max_maps */
+ struct llist_head free_list; /* unused MRs */
+ struct llist_head clean_list; /* unused & unmapped MRs */
+ wait_queue_head_t flush_wait;
+
+ atomic_t free_pinned; /* memory pinned by free MRs */
+ unsigned long max_items;
+ unsigned long max_items_soft;
+ unsigned long max_free_pinned;
+ struct ib_fmr_attr fmr_attr;
+ bool use_fastreg;
+};
+
+extern struct workqueue_struct *rds_ib_mr_wq;
+extern unsigned int rds_ib_mr_1m_pool_size;
+extern unsigned int rds_ib_mr_8k_pool_size;
+extern bool prefer_frmr;
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_dev,
+ int npages);
+void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev,
+ struct rds_info_rdma_connection *iinfo);
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *);
+void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
+ struct rds_sock *rs, u32 *key_ret);
+void rds_ib_sync_mr(void *trans_private, int dir);
+void rds_ib_free_mr(void *trans_private, int invalidate);
+void rds_ib_flush_mrs(void);
+int rds_ib_mr_init(void);
+void rds_ib_mr_exit(void);
+
+void __rds_ib_teardown_mr(struct rds_ib_mr *);
+void rds_ib_teardown_mr(struct rds_ib_mr *);
+struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *, int);
+int rds_ib_map_fmr(struct rds_ib_device *, struct rds_ib_mr *,
+ struct scatterlist *, unsigned int);
+struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *);
+int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *, int, struct rds_ib_mr **);
+struct rds_ib_mr *rds_ib_reg_fmr(struct rds_ib_device *, struct scatterlist *,
+ unsigned long, u32 *);
+struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *);
+void rds_ib_unreg_fmr(struct list_head *, unsigned int *,
+ unsigned long *, unsigned int);
+void rds_ib_free_fmr_list(struct rds_ib_mr *);
+struct rds_ib_mr *rds_ib_reg_frmr(struct rds_ib_device *rds_ibdev,
+ struct rds_ib_connection *ic,
+ struct scatterlist *sg,
+ unsigned long nents, u32 *key);
+void rds_ib_unreg_frmr(struct list_head *list, unsigned int *nfreed,
+ unsigned long *unpinned, unsigned int goal);
+void rds_ib_free_frmr_list(struct rds_ib_mr *);
+#endif
diff --git a/net/rds/ib_rdma.c b/net/rds/ib_rdma.c
index a2340748ec86..f7164ac1ffc1 100644
--- a/net/rds/ib_rdma.c
+++ b/net/rds/ib_rdma.c
@@ -35,78 +35,13 @@
#include <linux/rculist.h>
#include <linux/llist.h>
-#include "rds.h"
-#include "ib.h"
+#include "ib_mr.h"
+
+struct workqueue_struct *rds_ib_mr_wq;
static DEFINE_PER_CPU(unsigned long, clean_list_grace);
#define CLEAN_LIST_BUSY_BIT 0
-/*
- * This is stored as mr->r_trans_private.
- */
-struct rds_ib_mr {
- struct rds_ib_device *device;
- struct rds_ib_mr_pool *pool;
- struct ib_fmr *fmr;
-
- struct llist_node llnode;
-
- /* unmap_list is for freeing */
- struct list_head unmap_list;
- unsigned int remap_count;
-
- struct scatterlist *sg;
- unsigned int sg_len;
- u64 *dma;
- int sg_dma_len;
-};
-
-/*
- * Our own little FMR pool
- */
-struct rds_ib_mr_pool {
- unsigned int pool_type;
- struct mutex flush_lock; /* serialize fmr invalidate */
- struct delayed_work flush_worker; /* flush worker */
-
- atomic_t item_count; /* total # of MRs */
- atomic_t dirty_count; /* # dirty of MRs */
-
- struct llist_head drop_list; /* MRs that have reached their max_maps limit */
- struct llist_head free_list; /* unused MRs */
- struct llist_head clean_list; /* global unused & unamapped MRs */
- wait_queue_head_t flush_wait;
-
- atomic_t free_pinned; /* memory pinned by free MRs */
- unsigned long max_items;
- unsigned long max_items_soft;
- unsigned long max_free_pinned;
- struct ib_fmr_attr fmr_attr;
-};
-
-static struct workqueue_struct *rds_ib_fmr_wq;
-
-int rds_ib_fmr_init(void)
-{
- rds_ib_fmr_wq = create_workqueue("rds_fmr_flushd");
- if (!rds_ib_fmr_wq)
- return -ENOMEM;
- return 0;
-}
-
-/* By the time this is called all the IB devices should have been torn down and
- * had their pools freed. As each pool is freed its work struct is waited on,
- * so the pool flushing work queue should be idle by the time we get here.
- */
-void rds_ib_fmr_exit(void)
-{
- destroy_workqueue(rds_ib_fmr_wq);
-}
-
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool, int free_all, struct rds_ib_mr **);
-static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr);
-static void rds_ib_mr_pool_flush_worker(struct work_struct *work);
-
static struct rds_ib_device *rds_ib_get_device(__be32 ipaddr)
{
struct rds_ib_device *rds_ibdev;
@@ -235,41 +170,6 @@ void rds_ib_destroy_nodev_conns(void)
rds_conn_destroy(ic->conn);
}
-struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
- int pool_type)
-{
- struct rds_ib_mr_pool *pool;
-
- pool = kzalloc(sizeof(*pool), GFP_KERNEL);
- if (!pool)
- return ERR_PTR(-ENOMEM);
-
- pool->pool_type = pool_type;
- init_llist_head(&pool->free_list);
- init_llist_head(&pool->drop_list);
- init_llist_head(&pool->clean_list);
- mutex_init(&pool->flush_lock);
- init_waitqueue_head(&pool->flush_wait);
- INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
-
- if (pool_type == RDS_IB_MR_1M_POOL) {
- /* +1 allows for unaligned MRs */
- pool->fmr_attr.max_pages = RDS_FMR_1M_MSG_SIZE + 1;
- pool->max_items = RDS_FMR_1M_POOL_SIZE;
- } else {
- /* pool_type == RDS_IB_MR_8K_POOL */
- pool->fmr_attr.max_pages = RDS_FMR_8K_MSG_SIZE + 1;
- pool->max_items = RDS_FMR_8K_POOL_SIZE;
- }
-
- pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
- pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
- pool->fmr_attr.page_shift = PAGE_SHIFT;
- pool->max_items_soft = rds_ibdev->max_fmrs * 3 / 4;
-
- return pool;
-}
-
void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_connection *iinfo)
{
struct rds_ib_mr_pool *pool_1m = rds_ibdev->mr_1m_pool;
@@ -278,16 +178,7 @@ void rds_ib_get_mr_info(struct rds_ib_device *rds_ibdev, struct rds_info_rdma_co
iinfo->rdma_mr_size = pool_1m->fmr_attr.max_pages;
}
-void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
-{
- cancel_delayed_work_sync(&pool->flush_worker);
- rds_ib_flush_mr_pool(pool, 1, NULL);
- WARN_ON(atomic_read(&pool->item_count));
- WARN_ON(atomic_read(&pool->free_pinned));
- kfree(pool);
-}
-
-static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
+struct rds_ib_mr *rds_ib_reuse_mr(struct rds_ib_mr_pool *pool)
{
struct rds_ib_mr *ibmr = NULL;
struct llist_node *ret;
@@ -297,8 +188,13 @@ static inline struct rds_ib_mr *rds_ib_reuse_fmr(struct rds_ib_mr_pool *pool)
flag = this_cpu_ptr(&clean_list_grace);
set_bit(CLEAN_LIST_BUSY_BIT, flag);
ret = llist_del_first(&pool->clean_list);
- if (ret)
+ if (ret) {
ibmr = llist_entry(ret, struct rds_ib_mr, llnode);
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_reused);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_reused);
+ }
clear_bit(CLEAN_LIST_BUSY_BIT, flag);
preempt_enable();
@@ -317,190 +213,6 @@ static inline void wait_clean_list_grace(void)
}
}
-static struct rds_ib_mr *rds_ib_alloc_fmr(struct rds_ib_device *rds_ibdev,
- int npages)
-{
- struct rds_ib_mr_pool *pool;
- struct rds_ib_mr *ibmr = NULL;
- int err = 0, iter = 0;
-
- if (npages <= RDS_FMR_8K_MSG_SIZE)
- pool = rds_ibdev->mr_8k_pool;
- else
- pool = rds_ibdev->mr_1m_pool;
-
- if (atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
-
- /* Switch pools if one of the pool is reaching upper limit */
- if (atomic_read(&pool->dirty_count) >= pool->max_items * 9 / 10) {
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- pool = rds_ibdev->mr_1m_pool;
- else
- pool = rds_ibdev->mr_8k_pool;
- }
-
- while (1) {
- ibmr = rds_ib_reuse_fmr(pool);
- if (ibmr)
- return ibmr;
-
- /* No clean MRs - now we have the choice of either
- * allocating a fresh MR up to the limit imposed by the
- * driver, or flush any dirty unused MRs.
- * We try to avoid stalling in the send path if possible,
- * so we allocate as long as we're allowed to.
- *
- * We're fussy with enforcing the FMR limit, though. If the driver
- * tells us we can't use more than N fmrs, we shouldn't start
- * arguing with it */
- if (atomic_inc_return(&pool->item_count) <= pool->max_items)
- break;
-
- atomic_dec(&pool->item_count);
-
- if (++iter > 2) {
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
- return ERR_PTR(-EAGAIN);
- }
-
- /* We do have some empty MRs. Flush them out. */
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
- rds_ib_flush_mr_pool(pool, 0, &ibmr);
- if (ibmr)
- return ibmr;
- }
-
- ibmr = kzalloc_node(sizeof(*ibmr), GFP_KERNEL, rdsibdev_to_node(rds_ibdev));
- if (!ibmr) {
- err = -ENOMEM;
- goto out_no_cigar;
- }
-
- ibmr->fmr = ib_alloc_fmr(rds_ibdev->pd,
- (IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE|
- IB_ACCESS_REMOTE_ATOMIC),
- &pool->fmr_attr);
- if (IS_ERR(ibmr->fmr)) {
- err = PTR_ERR(ibmr->fmr);
- ibmr->fmr = NULL;
- printk(KERN_WARNING "RDS/IB: ib_alloc_fmr failed (err=%d)\n", err);
- goto out_no_cigar;
- }
-
- ibmr->pool = pool;
- if (pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_alloc);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_alloc);
-
- return ibmr;
-
-out_no_cigar:
- if (ibmr) {
- if (ibmr->fmr)
- ib_dealloc_fmr(ibmr->fmr);
- kfree(ibmr);
- }
- atomic_dec(&pool->item_count);
- return ERR_PTR(err);
-}
-
-static int rds_ib_map_fmr(struct rds_ib_device *rds_ibdev, struct rds_ib_mr *ibmr,
- struct scatterlist *sg, unsigned int nents)
-{
- struct ib_device *dev = rds_ibdev->dev;
- struct scatterlist *scat = sg;
- u64 io_addr = 0;
- u64 *dma_pages;
- u32 len;
- int page_cnt, sg_dma_len;
- int i, j;
- int ret;
-
- sg_dma_len = ib_dma_map_sg(dev, sg, nents,
- DMA_BIDIRECTIONAL);
- if (unlikely(!sg_dma_len)) {
- printk(KERN_WARNING "RDS/IB: dma_map_sg failed!\n");
- return -EBUSY;
- }
-
- len = 0;
- page_cnt = 0;
-
- for (i = 0; i < sg_dma_len; ++i) {
- unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
- u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
-
- if (dma_addr & ~PAGE_MASK) {
- if (i > 0)
- return -EINVAL;
- else
- ++page_cnt;
- }
- if ((dma_addr + dma_len) & ~PAGE_MASK) {
- if (i < sg_dma_len - 1)
- return -EINVAL;
- else
- ++page_cnt;
- }
-
- len += dma_len;
- }
-
- page_cnt += len >> PAGE_SHIFT;
- if (page_cnt > ibmr->pool->fmr_attr.max_pages)
- return -EINVAL;
-
- dma_pages = kmalloc_node(sizeof(u64) * page_cnt, GFP_ATOMIC,
- rdsibdev_to_node(rds_ibdev));
- if (!dma_pages)
- return -ENOMEM;
-
- page_cnt = 0;
- for (i = 0; i < sg_dma_len; ++i) {
- unsigned int dma_len = ib_sg_dma_len(dev, &scat[i]);
- u64 dma_addr = ib_sg_dma_address(dev, &scat[i]);
-
- for (j = 0; j < dma_len; j += PAGE_SIZE)
- dma_pages[page_cnt++] =
- (dma_addr & PAGE_MASK) + j;
- }
-
- ret = ib_map_phys_fmr(ibmr->fmr,
- dma_pages, page_cnt, io_addr);
- if (ret)
- goto out;
-
- /* Success - we successfully remapped the MR, so we can
- * safely tear down the old mapping. */
- rds_ib_teardown_mr(ibmr);
-
- ibmr->sg = scat;
- ibmr->sg_len = nents;
- ibmr->sg_dma_len = sg_dma_len;
- ibmr->remap_count++;
-
- if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_used);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_used);
- ret = 0;
-
-out:
- kfree(dma_pages);
-
- return ret;
-}
-
void rds_ib_sync_mr(void *trans_private, int direction)
{
struct rds_ib_mr *ibmr = trans_private;
@@ -518,7 +230,7 @@ void rds_ib_sync_mr(void *trans_private, int direction)
}
}
-static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
struct rds_ib_device *rds_ibdev = ibmr->device;
@@ -549,7 +261,7 @@ static void __rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
}
}
-static void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
+void rds_ib_teardown_mr(struct rds_ib_mr *ibmr)
{
unsigned int pinned = ibmr->sg_len;
@@ -623,17 +335,15 @@ static void list_to_llist_nodes(struct rds_ib_mr_pool *pool,
* If the number of MRs allocated exceeds the limit, we also try
* to free as many MRs as needed to get back to this limit.
*/
-static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
- int free_all, struct rds_ib_mr **ibmr_ret)
+int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
+ int free_all, struct rds_ib_mr **ibmr_ret)
{
- struct rds_ib_mr *ibmr, *next;
+ struct rds_ib_mr *ibmr;
struct llist_node *clean_nodes;
struct llist_node *clean_tail;
LIST_HEAD(unmap_list);
- LIST_HEAD(fmr_list);
unsigned long unpinned = 0;
unsigned int nfreed = 0, dirty_to_clean = 0, free_goal;
- int ret = 0;
if (pool->pool_type == RDS_IB_MR_8K_POOL)
rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_flush);
@@ -643,7 +353,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
if (ibmr_ret) {
DEFINE_WAIT(wait);
while (!mutex_trylock(&pool->flush_lock)) {
- ibmr = rds_ib_reuse_fmr(pool);
+ ibmr = rds_ib_reuse_mr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
finish_wait(&pool->flush_wait, &wait);
@@ -655,7 +365,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
if (llist_empty(&pool->clean_list))
schedule();
- ibmr = rds_ib_reuse_fmr(pool);
+ ibmr = rds_ib_reuse_mr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
finish_wait(&pool->flush_wait, &wait);
@@ -667,7 +377,7 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
mutex_lock(&pool->flush_lock);
if (ibmr_ret) {
- ibmr = rds_ib_reuse_fmr(pool);
+ ibmr = rds_ib_reuse_mr(pool);
if (ibmr) {
*ibmr_ret = ibmr;
goto out;
@@ -687,30 +397,10 @@ static int rds_ib_flush_mr_pool(struct rds_ib_mr_pool *pool,
if (list_empty(&unmap_list))
goto out;
- /* String all ib_mr's onto one list and hand them to ib_unmap_fmr */
- list_for_each_entry(ibmr, &unmap_list, unmap_list)
- list_add(&ibmr->fmr->list, &fmr_list);
-
- ret = ib_unmap_fmr(&fmr_list);
- if (ret)
- printk(KERN_WARNING "RDS/IB: ib_unmap_fmr failed (err=%d)\n", ret);
-
- /* Now we can destroy the DMA mapping and unpin any pages */
- list_for_each_entry_safe(ibmr, next, &unmap_list, unmap_list) {
- unpinned += ibmr->sg_len;
- __rds_ib_teardown_mr(ibmr);
- if (nfreed < free_goal ||
- ibmr->remap_count >= pool->fmr_attr.max_maps) {
- if (ibmr->pool->pool_type == RDS_IB_MR_8K_POOL)
- rds_ib_stats_inc(s_ib_rdma_mr_8k_free);
- else
- rds_ib_stats_inc(s_ib_rdma_mr_1m_free);
- list_del(&ibmr->unmap_list);
- ib_dealloc_fmr(ibmr->fmr);
- kfree(ibmr);
- nfreed++;
- }
- }
+ if (pool->use_fastreg)
+ rds_ib_unreg_frmr(&unmap_list, &nfreed, &unpinned, free_goal);
+ else
+ rds_ib_unreg_fmr(&unmap_list, &nfreed, &unpinned, free_goal);
if (!list_empty(&unmap_list)) {
/* we have to make sure that none of the things we're about
@@ -743,7 +433,47 @@ out:
if (waitqueue_active(&pool->flush_wait))
wake_up(&pool->flush_wait);
out_nolock:
- return ret;
+ return 0;
+}
+
+struct rds_ib_mr *rds_ib_try_reuse_ibmr(struct rds_ib_mr_pool *pool)
+{
+ struct rds_ib_mr *ibmr = NULL;
+ int iter = 0;
+
+ if (atomic_read(&pool->dirty_count) >= pool->max_items_soft / 10)
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
+
+ while (1) {
+ ibmr = rds_ib_reuse_mr(pool);
+ if (ibmr)
+ return ibmr;
+
+ if (atomic_inc_return(&pool->item_count) <= pool->max_items)
+ break;
+
+ atomic_dec(&pool->item_count);
+
+ if (++iter > 2) {
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_depleted);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_depleted);
+ return ERR_PTR(-EAGAIN);
+ }
+
+ /* We do have some empty MRs. Flush them out. */
+ if (pool->pool_type == RDS_IB_MR_8K_POOL)
+ rds_ib_stats_inc(s_ib_rdma_mr_8k_pool_wait);
+ else
+ rds_ib_stats_inc(s_ib_rdma_mr_1m_pool_wait);
+
+ rds_ib_flush_mr_pool(pool, 0, &ibmr);
+ if (ibmr)
+ return ibmr;
+ }
+
+ return ibmr;
}
static void rds_ib_mr_pool_flush_worker(struct work_struct *work)
@@ -762,10 +492,10 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
rdsdebug("RDS/IB: free_mr nents %u\n", ibmr->sg_len);
/* Return it to the pool's free list */
- if (ibmr->remap_count >= pool->fmr_attr.max_maps)
- llist_add(&ibmr->llnode, &pool->drop_list);
+ if (rds_ibdev->use_fastreg)
+ rds_ib_free_frmr_list(ibmr);
else
- llist_add(&ibmr->llnode, &pool->free_list);
+ rds_ib_free_fmr_list(ibmr);
atomic_add(ibmr->sg_len, &pool->free_pinned);
atomic_inc(&pool->dirty_count);
@@ -773,7 +503,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
/* If we've pinned too many pages, request a flush */
if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
atomic_read(&pool->dirty_count) >= pool->max_items / 5)
- queue_delayed_work(rds_ib_fmr_wq, &pool->flush_worker, 10);
+ queue_delayed_work(rds_ib_mr_wq, &pool->flush_worker, 10);
if (invalidate) {
if (likely(!in_interrupt())) {
@@ -782,7 +512,7 @@ void rds_ib_free_mr(void *trans_private, int invalidate)
/* We get here if the user created a MR marked
* as use_once and invalidate at the same time.
*/
- queue_delayed_work(rds_ib_fmr_wq,
+ queue_delayed_work(rds_ib_mr_wq,
&pool->flush_worker, 10);
}
}
@@ -810,6 +540,7 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
{
struct rds_ib_device *rds_ibdev;
struct rds_ib_mr *ibmr = NULL;
+ struct rds_ib_connection *ic = rs->rs_conn->c_transport_data;
int ret;
rds_ibdev = rds_ib_get_device(rs->rs_bound_addr);
@@ -823,29 +554,81 @@ void *rds_ib_get_mr(struct scatterlist *sg, unsigned long nents,
goto out;
}
- ibmr = rds_ib_alloc_fmr(rds_ibdev, nents);
- if (IS_ERR(ibmr)) {
- rds_ib_dev_put(rds_ibdev);
- return ibmr;
- }
-
- ret = rds_ib_map_fmr(rds_ibdev, ibmr, sg, nents);
- if (ret == 0)
- *key_ret = ibmr->fmr->rkey;
+ if (rds_ibdev->use_fastreg)
+ ibmr = rds_ib_reg_frmr(rds_ibdev, ic, sg, nents, key_ret);
else
- printk(KERN_WARNING "RDS/IB: map_fmr failed (errno=%d)\n", ret);
-
- ibmr->device = rds_ibdev;
- rds_ibdev = NULL;
+ ibmr = rds_ib_reg_fmr(rds_ibdev, sg, nents, key_ret);
+ if (ibmr)
+ rds_ibdev = NULL;
out:
- if (ret) {
- if (ibmr)
- rds_ib_free_mr(ibmr, 0);
- ibmr = ERR_PTR(ret);
- }
+ if (!ibmr)
+ pr_warn("RDS/IB: rds_ib_get_mr failed (errno=%d)\n", ret);
+
if (rds_ibdev)
rds_ib_dev_put(rds_ibdev);
+
return ibmr;
}
+void rds_ib_destroy_mr_pool(struct rds_ib_mr_pool *pool)
+{
+ cancel_delayed_work_sync(&pool->flush_worker);
+ rds_ib_flush_mr_pool(pool, 1, NULL);
+ WARN_ON(atomic_read(&pool->item_count));
+ WARN_ON(atomic_read(&pool->free_pinned));
+ kfree(pool);
+}
+
+struct rds_ib_mr_pool *rds_ib_create_mr_pool(struct rds_ib_device *rds_ibdev,
+ int pool_type)
+{
+ struct rds_ib_mr_pool *pool;
+
+ pool = kzalloc(sizeof(*pool), GFP_KERNEL);
+ if (!pool)
+ return ERR_PTR(-ENOMEM);
+
+ pool->pool_type = pool_type;
+ init_llist_head(&pool->free_list);
+ init_llist_head(&pool->drop_list);
+ init_llist_head(&pool->clean_list);
+ mutex_init(&pool->flush_lock);
+ init_waitqueue_head(&pool->flush_wait);
+ INIT_DELAYED_WORK(&pool->flush_worker, rds_ib_mr_pool_flush_worker);
+
+ if (pool_type == RDS_IB_MR_1M_POOL) {
+ /* +1 allows for unaligned MRs */
+ pool->fmr_attr.max_pages = RDS_MR_1M_MSG_SIZE + 1;
+ pool->max_items = RDS_MR_1M_POOL_SIZE;
+ } else {
+ /* pool_type == RDS_IB_MR_8K_POOL */
+ pool->fmr_attr.max_pages = RDS_MR_8K_MSG_SIZE + 1;
+ pool->max_items = RDS_MR_8K_POOL_SIZE;
+ }
+
+ pool->max_free_pinned = pool->max_items * pool->fmr_attr.max_pages / 4;
+ pool->fmr_attr.max_maps = rds_ibdev->fmr_max_remaps;
+ pool->fmr_attr.page_shift = PAGE_SHIFT;
+ pool->max_items_soft = rds_ibdev->max_mrs * 3 / 4;
+ pool->use_fastreg = rds_ibdev->use_fastreg;
+
+ return pool;
+}
+
+int rds_ib_mr_init(void)
+{
+ rds_ib_mr_wq = create_workqueue("rds_mr_flushd");
+ if (!rds_ib_mr_wq)
+ return -ENOMEM;
+ return 0;
+}
+
+/* By the time this is called all the IB devices should have been torn down and
+ * had their pools freed. As each pool is freed its work struct is waited on,
+ * so the pool flushing work queue should be idle by the time we get here.
+ */
+void rds_ib_mr_exit(void)
+{
+ destroy_workqueue(rds_ib_mr_wq);
+}
diff --git a/net/rds/ib_recv.c b/net/rds/ib_recv.c
index 96744b75db93..abc8cc805e8d 100644
--- a/net/rds/ib_recv.c
+++ b/net/rds/ib_recv.c
@@ -305,7 +305,7 @@ static int rds_ib_recv_refill_one(struct rds_connection *conn,
gfp_t slab_mask = GFP_NOWAIT;
gfp_t page_mask = GFP_NOWAIT;
- if (gfp & __GFP_WAIT) {
+ if (gfp & __GFP_DIRECT_RECLAIM) {
slab_mask = GFP_KERNEL;
page_mask = GFP_HIGHUSER;
}
@@ -379,7 +379,7 @@ void rds_ib_recv_refill(struct rds_connection *conn, int prefill, gfp_t gfp)
struct ib_recv_wr *failed_wr;
unsigned int posted = 0;
int ret = 0;
- bool can_wait = !!(gfp & __GFP_WAIT);
+ bool can_wait = !!(gfp & __GFP_DIRECT_RECLAIM);
u32 pos;
/* the goal here is to just make sure that someone, somewhere
@@ -796,7 +796,7 @@ static void rds_ib_cong_recv(struct rds_connection *conn,
addr = kmap_atomic(sg_page(&frag->f_sg));
- src = addr + frag_off;
+ src = addr + frag->f_sg.offset + frag_off;
dst = (void *)map->m_page_addrs[map_page] + map_off;
for (k = 0; k < to_copy; k += 8) {
/* Record ports that became uncongested, ie
diff --git a/net/rds/ib_send.c b/net/rds/ib_send.c
index 670882c752e9..f27d2c82b036 100644
--- a/net/rds/ib_send.c
+++ b/net/rds/ib_send.c
@@ -195,7 +195,7 @@ void rds_ib_send_init_ring(struct rds_ib_connection *ic)
send->s_op = NULL;
- send->s_wr.wr_id = i | RDS_IB_SEND_OP;
+ send->s_wr.wr_id = i;
send->s_wr.sg_list = send->s_sge;
send->s_wr.ex.imm_data = 0;
@@ -263,9 +263,7 @@ void rds_ib_send_cqe_handler(struct rds_ib_connection *ic, struct ib_wc *wc)
oldest = rds_ib_ring_oldest(&ic->i_send_ring);
- completed = rds_ib_ring_completed(&ic->i_send_ring,
- (wc->wr_id & ~RDS_IB_SEND_OP),
- oldest);
+ completed = rds_ib_ring_completed(&ic->i_send_ring, wc->wr_id, oldest);
for (i = 0; i < completed; i++) {
send = &ic->i_sends[oldest];
@@ -777,23 +775,23 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
send->s_queued = jiffies;
if (op->op_type == RDS_ATOMIC_TYPE_CSWP) {
- send->s_wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
- send->s_wr.wr.atomic.compare_add = op->op_m_cswp.compare;
- send->s_wr.wr.atomic.swap = op->op_m_cswp.swap;
- send->s_wr.wr.atomic.compare_add_mask = op->op_m_cswp.compare_mask;
- send->s_wr.wr.atomic.swap_mask = op->op_m_cswp.swap_mask;
+ send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_CMP_AND_SWP;
+ send->s_atomic_wr.compare_add = op->op_m_cswp.compare;
+ send->s_atomic_wr.swap = op->op_m_cswp.swap;
+ send->s_atomic_wr.compare_add_mask = op->op_m_cswp.compare_mask;
+ send->s_atomic_wr.swap_mask = op->op_m_cswp.swap_mask;
} else { /* FADD */
- send->s_wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
- send->s_wr.wr.atomic.compare_add = op->op_m_fadd.add;
- send->s_wr.wr.atomic.swap = 0;
- send->s_wr.wr.atomic.compare_add_mask = op->op_m_fadd.nocarry_mask;
- send->s_wr.wr.atomic.swap_mask = 0;
+ send->s_atomic_wr.wr.opcode = IB_WR_MASKED_ATOMIC_FETCH_AND_ADD;
+ send->s_atomic_wr.compare_add = op->op_m_fadd.add;
+ send->s_atomic_wr.swap = 0;
+ send->s_atomic_wr.compare_add_mask = op->op_m_fadd.nocarry_mask;
+ send->s_atomic_wr.swap_mask = 0;
}
nr_sig = rds_ib_set_wr_signal_state(ic, send, op->op_notify);
- send->s_wr.num_sge = 1;
- send->s_wr.next = NULL;
- send->s_wr.wr.atomic.remote_addr = op->op_remote_addr;
- send->s_wr.wr.atomic.rkey = op->op_rkey;
+ send->s_atomic_wr.wr.num_sge = 1;
+ send->s_atomic_wr.wr.next = NULL;
+ send->s_atomic_wr.remote_addr = op->op_remote_addr;
+ send->s_atomic_wr.rkey = op->op_rkey;
send->s_op = op;
rds_message_addref(container_of(send->s_op, struct rds_message, atomic));
@@ -818,11 +816,11 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
if (nr_sig)
atomic_add(nr_sig, &ic->i_signaled_sends);
- failed_wr = &send->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &send->s_wr, &failed_wr);
+ failed_wr = &send->s_atomic_wr.wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &send->s_atomic_wr.wr, &failed_wr);
rdsdebug("ic %p send %p (wr %p) ret %d wr %p\n", ic,
- send, &send->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &send->s_wr);
+ send, &send->s_atomic_wr, ret, failed_wr);
+ BUG_ON(failed_wr != &send->s_atomic_wr.wr);
if (ret) {
printk(KERN_WARNING "RDS/IB: atomic ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
@@ -831,9 +829,9 @@ int rds_ib_xmit_atomic(struct rds_connection *conn, struct rm_atomic_op *op)
goto out;
}
- if (unlikely(failed_wr != &send->s_wr)) {
+ if (unlikely(failed_wr != &send->s_atomic_wr.wr)) {
printk(KERN_WARNING "RDS/IB: atomic ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
- BUG_ON(failed_wr != &send->s_wr);
+ BUG_ON(failed_wr != &send->s_atomic_wr.wr);
}
out:
@@ -904,22 +902,23 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
nr_sig += rds_ib_set_wr_signal_state(ic, send, op->op_notify);
send->s_wr.opcode = op->op_write ? IB_WR_RDMA_WRITE : IB_WR_RDMA_READ;
- send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->op_rkey;
+ send->s_rdma_wr.remote_addr = remote_addr;
+ send->s_rdma_wr.rkey = op->op_rkey;
if (num_sge > max_sge) {
- send->s_wr.num_sge = max_sge;
+ send->s_rdma_wr.wr.num_sge = max_sge;
num_sge -= max_sge;
} else {
- send->s_wr.num_sge = num_sge;
+ send->s_rdma_wr.wr.num_sge = num_sge;
}
- send->s_wr.next = NULL;
+ send->s_rdma_wr.wr.next = NULL;
if (prev)
- prev->s_wr.next = &send->s_wr;
+ prev->s_rdma_wr.wr.next = &send->s_rdma_wr.wr;
- for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
+ for (j = 0; j < send->s_rdma_wr.wr.num_sge &&
+ scat != &op->op_sg[op->op_count]; j++) {
len = ib_sg_dma_len(ic->i_cm_id->device, scat);
send->s_sge[j].addr =
ib_sg_dma_address(ic->i_cm_id->device, scat);
@@ -934,7 +933,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
}
rdsdebug("send %p wr %p num_sge %u next %p\n", send,
- &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
+ &send->s_rdma_wr.wr,
+ send->s_rdma_wr.wr.num_sge,
+ send->s_rdma_wr.wr.next);
prev = send;
if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
@@ -955,11 +956,11 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
if (nr_sig)
atomic_add(nr_sig, &ic->i_signaled_sends);
- failed_wr = &first->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
+ failed_wr = &first->s_rdma_wr.wr;
+ ret = ib_post_send(ic->i_cm_id->qp, &first->s_rdma_wr.wr, &failed_wr);
rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
- first, &first->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &first->s_wr);
+ first, &first->s_rdma_wr.wr, ret, failed_wr);
+ BUG_ON(failed_wr != &first->s_rdma_wr.wr);
if (ret) {
printk(KERN_WARNING "RDS/IB: rdma ib_post_send to %pI4 "
"returned %d\n", &conn->c_faddr, ret);
@@ -968,9 +969,9 @@ int rds_ib_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
goto out;
}
- if (unlikely(failed_wr != &first->s_wr)) {
+ if (unlikely(failed_wr != &first->s_rdma_wr.wr)) {
printk(KERN_WARNING "RDS/IB: ib_post_send() rc=%d, but failed_wqe updated!\n", ret);
- BUG_ON(failed_wr != &first->s_wr);
+ BUG_ON(failed_wr != &first->s_rdma_wr.wr);
}
diff --git a/net/rds/ib_stats.c b/net/rds/ib_stats.c
index d77e04473056..7e78dca1f252 100644
--- a/net/rds/ib_stats.c
+++ b/net/rds/ib_stats.c
@@ -73,6 +73,8 @@ static const char *const rds_ib_stat_names[] = {
"ib_rdma_mr_1m_pool_flush",
"ib_rdma_mr_1m_pool_wait",
"ib_rdma_mr_1m_pool_depleted",
+ "ib_rdma_mr_8k_reused",
+ "ib_rdma_mr_1m_reused",
"ib_atomic_cswp",
"ib_atomic_fadd",
};
diff --git a/net/rds/iw.c b/net/rds/iw.c
deleted file mode 100644
index 3df0295c6659..000000000000
--- a/net/rds/iw.c
+++ /dev/null
@@ -1,325 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/if.h>
-#include <linux/netdevice.h>
-#include <linux/inetdevice.h>
-#include <linux/if_arp.h>
-#include <linux/delay.h>
-#include <linux/slab.h>
-#include <linux/module.h>
-
-#include "rds.h"
-#include "iw.h"
-
-unsigned int fastreg_pool_size = RDS_FASTREG_POOL_SIZE;
-unsigned int fastreg_message_size = RDS_FASTREG_SIZE + 1; /* +1 allows for unaligned MRs */
-
-module_param(fastreg_pool_size, int, 0444);
-MODULE_PARM_DESC(fastreg_pool_size, " Max number of fastreg MRs per device");
-module_param(fastreg_message_size, int, 0444);
-MODULE_PARM_DESC(fastreg_message_size, " Max size of a RDMA transfer (fastreg MRs)");
-
-struct list_head rds_iw_devices;
-
-/* NOTE: if also grabbing iwdev lock, grab this first */
-DEFINE_SPINLOCK(iw_nodev_conns_lock);
-LIST_HEAD(iw_nodev_conns);
-
-static void rds_iw_add_one(struct ib_device *device)
-{
- struct rds_iw_device *rds_iwdev;
- struct ib_device_attr *dev_attr;
-
- /* Only handle iwarp devices */
- if (device->node_type != RDMA_NODE_RNIC)
- return;
-
- dev_attr = kmalloc(sizeof *dev_attr, GFP_KERNEL);
- if (!dev_attr)
- return;
-
- if (ib_query_device(device, dev_attr)) {
- rdsdebug("Query device failed for %s\n", device->name);
- goto free_attr;
- }
-
- rds_iwdev = kmalloc(sizeof *rds_iwdev, GFP_KERNEL);
- if (!rds_iwdev)
- goto free_attr;
-
- spin_lock_init(&rds_iwdev->spinlock);
-
- rds_iwdev->dma_local_lkey = !!(dev_attr->device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY);
- rds_iwdev->max_wrs = dev_attr->max_qp_wr;
- rds_iwdev->max_sge = min(dev_attr->max_sge, RDS_IW_MAX_SGE);
-
- rds_iwdev->dev = device;
- rds_iwdev->pd = ib_alloc_pd(device);
- if (IS_ERR(rds_iwdev->pd))
- goto free_dev;
-
- if (!rds_iwdev->dma_local_lkey) {
- rds_iwdev->mr = ib_get_dma_mr(rds_iwdev->pd,
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE |
- IB_ACCESS_LOCAL_WRITE);
- if (IS_ERR(rds_iwdev->mr))
- goto err_pd;
- } else
- rds_iwdev->mr = NULL;
-
- rds_iwdev->mr_pool = rds_iw_create_mr_pool(rds_iwdev);
- if (IS_ERR(rds_iwdev->mr_pool)) {
- rds_iwdev->mr_pool = NULL;
- goto err_mr;
- }
-
- INIT_LIST_HEAD(&rds_iwdev->cm_id_list);
- INIT_LIST_HEAD(&rds_iwdev->conn_list);
- list_add_tail(&rds_iwdev->list, &rds_iw_devices);
-
- ib_set_client_data(device, &rds_iw_client, rds_iwdev);
-
- goto free_attr;
-
-err_mr:
- if (rds_iwdev->mr)
- ib_dereg_mr(rds_iwdev->mr);
-err_pd:
- ib_dealloc_pd(rds_iwdev->pd);
-free_dev:
- kfree(rds_iwdev);
-free_attr:
- kfree(dev_attr);
-}
-
-static void rds_iw_remove_one(struct ib_device *device, void *client_data)
-{
- struct rds_iw_device *rds_iwdev = client_data;
- struct rds_iw_cm_id *i_cm_id, *next;
-
- if (!rds_iwdev)
- return;
-
- spin_lock_irq(&rds_iwdev->spinlock);
- list_for_each_entry_safe(i_cm_id, next, &rds_iwdev->cm_id_list, list) {
- list_del(&i_cm_id->list);
- kfree(i_cm_id);
- }
- spin_unlock_irq(&rds_iwdev->spinlock);
-
- rds_iw_destroy_conns(rds_iwdev);
-
- if (rds_iwdev->mr_pool)
- rds_iw_destroy_mr_pool(rds_iwdev->mr_pool);
-
- if (rds_iwdev->mr)
- ib_dereg_mr(rds_iwdev->mr);
-
- ib_dealloc_pd(rds_iwdev->pd);
-
- list_del(&rds_iwdev->list);
- kfree(rds_iwdev);
-}
-
-struct ib_client rds_iw_client = {
- .name = "rds_iw",
- .add = rds_iw_add_one,
- .remove = rds_iw_remove_one
-};
-
-static int rds_iw_conn_info_visitor(struct rds_connection *conn,
- void *buffer)
-{
- struct rds_info_rdma_connection *iinfo = buffer;
- struct rds_iw_connection *ic;
-
- /* We will only ever look at IB transports */
- if (conn->c_trans != &rds_iw_transport)
- return 0;
-
- iinfo->src_addr = conn->c_laddr;
- iinfo->dst_addr = conn->c_faddr;
-
- memset(&iinfo->src_gid, 0, sizeof(iinfo->src_gid));
- memset(&iinfo->dst_gid, 0, sizeof(iinfo->dst_gid));
- if (rds_conn_state(conn) == RDS_CONN_UP) {
- struct rds_iw_device *rds_iwdev;
- struct rdma_dev_addr *dev_addr;
-
- ic = conn->c_transport_data;
- dev_addr = &ic->i_cm_id->route.addr.dev_addr;
-
- rdma_addr_get_sgid(dev_addr, (union ib_gid *) &iinfo->src_gid);
- rdma_addr_get_dgid(dev_addr, (union ib_gid *) &iinfo->dst_gid);
-
- rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
- iinfo->max_send_wr = ic->i_send_ring.w_nr;
- iinfo->max_recv_wr = ic->i_recv_ring.w_nr;
- iinfo->max_send_sge = rds_iwdev->max_sge;
- rds_iw_get_mr_info(rds_iwdev, iinfo);
- }
- return 1;
-}
-
-static void rds_iw_ic_info(struct socket *sock, unsigned int len,
- struct rds_info_iterator *iter,
- struct rds_info_lengths *lens)
-{
- rds_for_each_conn_info(sock, len, iter, lens,
- rds_iw_conn_info_visitor,
- sizeof(struct rds_info_rdma_connection));
-}
-
-
-/*
- * Early RDS/IB was built to only bind to an address if there is an IPoIB
- * device with that address set.
- *
- * If it were me, I'd advocate for something more flexible. Sending and
- * receiving should be device-agnostic. Transports would try and maintain
- * connections between peers who have messages queued. Userspace would be
- * allowed to influence which paths have priority. We could call userspace
- * asserting this policy "routing".
- */
-static int rds_iw_laddr_check(struct net *net, __be32 addr)
-{
- int ret;
- struct rdma_cm_id *cm_id;
- struct sockaddr_in sin;
-
- /* Create a CMA ID and try to bind it. This catches both
- * IB and iWARP capable NICs.
- */
- cm_id = rdma_create_id(NULL, NULL, RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(cm_id))
- return PTR_ERR(cm_id);
-
- memset(&sin, 0, sizeof(sin));
- sin.sin_family = AF_INET;
- sin.sin_addr.s_addr = addr;
-
- /* rdma_bind_addr will only succeed for IB & iWARP devices */
- ret = rdma_bind_addr(cm_id, (struct sockaddr *)&sin);
- /* due to this, we will claim to support IB devices unless we
- check node_type. */
- if (ret || !cm_id->device ||
- cm_id->device->node_type != RDMA_NODE_RNIC)
- ret = -EADDRNOTAVAIL;
-
- rdsdebug("addr %pI4 ret %d node type %d\n",
- &addr, ret,
- cm_id->device ? cm_id->device->node_type : -1);
-
- rdma_destroy_id(cm_id);
-
- return ret;
-}
-
-void rds_iw_exit(void)
-{
- rds_info_deregister_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
- rds_iw_destroy_nodev_conns();
- ib_unregister_client(&rds_iw_client);
- rds_iw_sysctl_exit();
- rds_iw_recv_exit();
- rds_trans_unregister(&rds_iw_transport);
-}
-
-struct rds_transport rds_iw_transport = {
- .laddr_check = rds_iw_laddr_check,
- .xmit_complete = rds_iw_xmit_complete,
- .xmit = rds_iw_xmit,
- .xmit_rdma = rds_iw_xmit_rdma,
- .recv = rds_iw_recv,
- .conn_alloc = rds_iw_conn_alloc,
- .conn_free = rds_iw_conn_free,
- .conn_connect = rds_iw_conn_connect,
- .conn_shutdown = rds_iw_conn_shutdown,
- .inc_copy_to_user = rds_iw_inc_copy_to_user,
- .inc_free = rds_iw_inc_free,
- .cm_initiate_connect = rds_iw_cm_initiate_connect,
- .cm_handle_connect = rds_iw_cm_handle_connect,
- .cm_connect_complete = rds_iw_cm_connect_complete,
- .stats_info_copy = rds_iw_stats_info_copy,
- .exit = rds_iw_exit,
- .get_mr = rds_iw_get_mr,
- .sync_mr = rds_iw_sync_mr,
- .free_mr = rds_iw_free_mr,
- .flush_mrs = rds_iw_flush_mrs,
- .t_owner = THIS_MODULE,
- .t_name = "iwarp",
- .t_type = RDS_TRANS_IWARP,
- .t_prefer_loopback = 1,
-};
-
-int rds_iw_init(void)
-{
- int ret;
-
- INIT_LIST_HEAD(&rds_iw_devices);
-
- ret = ib_register_client(&rds_iw_client);
- if (ret)
- goto out;
-
- ret = rds_iw_sysctl_init();
- if (ret)
- goto out_ibreg;
-
- ret = rds_iw_recv_init();
- if (ret)
- goto out_sysctl;
-
- ret = rds_trans_register(&rds_iw_transport);
- if (ret)
- goto out_recv;
-
- rds_info_register_func(RDS_INFO_IWARP_CONNECTIONS, rds_iw_ic_info);
-
- goto out;
-
-out_recv:
- rds_iw_recv_exit();
-out_sysctl:
- rds_iw_sysctl_exit();
-out_ibreg:
- ib_unregister_client(&rds_iw_client);
-out:
- return ret;
-}
-
-MODULE_LICENSE("GPL");
-
diff --git a/net/rds/iw.h b/net/rds/iw.h
deleted file mode 100644
index cbe6674e31ee..000000000000
--- a/net/rds/iw.h
+++ /dev/null
@@ -1,395 +0,0 @@
-#ifndef _RDS_IW_H
-#define _RDS_IW_H
-
-#include <linux/interrupt.h>
-#include <rdma/ib_verbs.h>
-#include <rdma/rdma_cm.h>
-#include "rds.h"
-#include "rdma_transport.h"
-
-#define RDS_FASTREG_SIZE 20
-#define RDS_FASTREG_POOL_SIZE 2048
-
-#define RDS_IW_MAX_SGE 8
-#define RDS_IW_RECV_SGE 2
-
-#define RDS_IW_DEFAULT_RECV_WR 1024
-#define RDS_IW_DEFAULT_SEND_WR 256
-
-#define RDS_IW_SUPPORTED_PROTOCOLS 0x00000003 /* minor versions supported */
-
-extern struct list_head rds_iw_devices;
-
-/*
- * IB posts RDS_FRAG_SIZE fragments of pages to the receive queues to
- * try and minimize the amount of memory tied up both the device and
- * socket receive queues.
- */
-/* page offset of the final full frag that fits in the page */
-#define RDS_PAGE_LAST_OFF (((PAGE_SIZE / RDS_FRAG_SIZE) - 1) * RDS_FRAG_SIZE)
-struct rds_page_frag {
- struct list_head f_item;
- struct page *f_page;
- unsigned long f_offset;
- dma_addr_t f_mapped;
-};
-
-struct rds_iw_incoming {
- struct list_head ii_frags;
- struct rds_incoming ii_inc;
-};
-
-struct rds_iw_connect_private {
- /* Add new fields at the end, and don't permute existing fields. */
- __be32 dp_saddr;
- __be32 dp_daddr;
- u8 dp_protocol_major;
- u8 dp_protocol_minor;
- __be16 dp_protocol_minor_mask; /* bitmask */
- __be32 dp_reserved1;
- __be64 dp_ack_seq;
- __be32 dp_credit; /* non-zero enables flow ctl */
-};
-
-struct rds_iw_scatterlist {
- struct scatterlist *list;
- unsigned int len;
- int dma_len;
- unsigned int dma_npages;
- unsigned int bytes;
-};
-
-struct rds_iw_mapping {
- spinlock_t m_lock; /* protect the mapping struct */
- struct list_head m_list;
- struct rds_iw_mr *m_mr;
- uint32_t m_rkey;
- struct rds_iw_scatterlist m_sg;
-};
-
-struct rds_iw_send_work {
- struct rds_message *s_rm;
-
- /* We should really put these into a union: */
- struct rm_rdma_op *s_op;
- struct rds_iw_mapping *s_mapping;
- struct ib_mr *s_mr;
- struct ib_fast_reg_page_list *s_page_list;
- unsigned char s_remap_count;
-
- struct ib_send_wr s_wr;
- struct ib_sge s_sge[RDS_IW_MAX_SGE];
- unsigned long s_queued;
-};
-
-struct rds_iw_recv_work {
- struct rds_iw_incoming *r_iwinc;
- struct rds_page_frag *r_frag;
- struct ib_recv_wr r_wr;
- struct ib_sge r_sge[2];
-};
-
-struct rds_iw_work_ring {
- u32 w_nr;
- u32 w_alloc_ptr;
- u32 w_alloc_ctr;
- u32 w_free_ptr;
- atomic_t w_free_ctr;
-};
-
-struct rds_iw_device;
-
-struct rds_iw_connection {
-
- struct list_head iw_node;
- struct rds_iw_device *rds_iwdev;
- struct rds_connection *conn;
-
- /* alphabet soup, IBTA style */
- struct rdma_cm_id *i_cm_id;
- struct ib_pd *i_pd;
- struct ib_mr *i_mr;
- struct ib_cq *i_send_cq;
- struct ib_cq *i_recv_cq;
-
- /* tx */
- struct rds_iw_work_ring i_send_ring;
- struct rds_message *i_rm;
- struct rds_header *i_send_hdrs;
- u64 i_send_hdrs_dma;
- struct rds_iw_send_work *i_sends;
-
- /* rx */
- struct tasklet_struct i_recv_tasklet;
- struct mutex i_recv_mutex;
- struct rds_iw_work_ring i_recv_ring;
- struct rds_iw_incoming *i_iwinc;
- u32 i_recv_data_rem;
- struct rds_header *i_recv_hdrs;
- u64 i_recv_hdrs_dma;
- struct rds_iw_recv_work *i_recvs;
- struct rds_page_frag i_frag;
- u64 i_ack_recv; /* last ACK received */
-
- /* sending acks */
- unsigned long i_ack_flags;
-#ifdef KERNEL_HAS_ATOMIC64
- atomic64_t i_ack_next; /* next ACK to send */
-#else
- spinlock_t i_ack_lock; /* protect i_ack_next */
- u64 i_ack_next; /* next ACK to send */
-#endif
- struct rds_header *i_ack;
- struct ib_send_wr i_ack_wr;
- struct ib_sge i_ack_sge;
- u64 i_ack_dma;
- unsigned long i_ack_queued;
-
- /* Flow control related information
- *
- * Our algorithm uses a pair variables that we need to access
- * atomically - one for the send credits, and one posted
- * recv credits we need to transfer to remote.
- * Rather than protect them using a slow spinlock, we put both into
- * a single atomic_t and update it using cmpxchg
- */
- atomic_t i_credits;
-
- /* Protocol version specific information */
- unsigned int i_flowctl:1; /* enable/disable flow ctl */
- unsigned int i_dma_local_lkey:1;
- unsigned int i_fastreg_posted:1; /* fastreg posted on this connection */
- /* Batched completions */
- unsigned int i_unsignaled_wrs;
- long i_unsignaled_bytes;
-};
-
-/* This assumes that atomic_t is at least 32 bits */
-#define IB_GET_SEND_CREDITS(v) ((v) & 0xffff)
-#define IB_GET_POST_CREDITS(v) ((v) >> 16)
-#define IB_SET_SEND_CREDITS(v) ((v) & 0xffff)
-#define IB_SET_POST_CREDITS(v) ((v) << 16)
-
-struct rds_iw_cm_id {
- struct list_head list;
- struct rdma_cm_id *cm_id;
-};
-
-struct rds_iw_device {
- struct list_head list;
- struct list_head cm_id_list;
- struct list_head conn_list;
- struct ib_device *dev;
- struct ib_pd *pd;
- struct ib_mr *mr;
- struct rds_iw_mr_pool *mr_pool;
- int max_sge;
- unsigned int max_wrs;
- unsigned int dma_local_lkey:1;
- spinlock_t spinlock; /* protect the above */
-};
-
-/* bits for i_ack_flags */
-#define IB_ACK_IN_FLIGHT 0
-#define IB_ACK_REQUESTED 1
-
-/* Magic WR_ID for ACKs */
-#define RDS_IW_ACK_WR_ID ((u64)0xffffffffffffffffULL)
-#define RDS_IW_FAST_REG_WR_ID ((u64)0xefefefefefefefefULL)
-#define RDS_IW_LOCAL_INV_WR_ID ((u64)0xdfdfdfdfdfdfdfdfULL)
-
-struct rds_iw_statistics {
- uint64_t s_iw_connect_raced;
- uint64_t s_iw_listen_closed_stale;
- uint64_t s_iw_tx_cq_call;
- uint64_t s_iw_tx_cq_event;
- uint64_t s_iw_tx_ring_full;
- uint64_t s_iw_tx_throttle;
- uint64_t s_iw_tx_sg_mapping_failure;
- uint64_t s_iw_tx_stalled;
- uint64_t s_iw_tx_credit_updates;
- uint64_t s_iw_rx_cq_call;
- uint64_t s_iw_rx_cq_event;
- uint64_t s_iw_rx_ring_empty;
- uint64_t s_iw_rx_refill_from_cq;
- uint64_t s_iw_rx_refill_from_thread;
- uint64_t s_iw_rx_alloc_limit;
- uint64_t s_iw_rx_credit_updates;
- uint64_t s_iw_ack_sent;
- uint64_t s_iw_ack_send_failure;
- uint64_t s_iw_ack_send_delayed;
- uint64_t s_iw_ack_send_piggybacked;
- uint64_t s_iw_ack_received;
- uint64_t s_iw_rdma_mr_alloc;
- uint64_t s_iw_rdma_mr_free;
- uint64_t s_iw_rdma_mr_used;
- uint64_t s_iw_rdma_mr_pool_flush;
- uint64_t s_iw_rdma_mr_pool_wait;
- uint64_t s_iw_rdma_mr_pool_depleted;
-};
-
-extern struct workqueue_struct *rds_iw_wq;
-
-/*
- * Fake ib_dma_sync_sg_for_{cpu,device} as long as ib_verbs.h
- * doesn't define it.
- */
-static inline void rds_iw_dma_sync_sg_for_cpu(struct ib_device *dev,
- struct scatterlist *sg, unsigned int sg_dma_len, int direction)
-{
- unsigned int i;
-
- for (i = 0; i < sg_dma_len; ++i) {
- ib_dma_sync_single_for_cpu(dev,
- ib_sg_dma_address(dev, &sg[i]),
- ib_sg_dma_len(dev, &sg[i]),
- direction);
- }
-}
-#define ib_dma_sync_sg_for_cpu rds_iw_dma_sync_sg_for_cpu
-
-static inline void rds_iw_dma_sync_sg_for_device(struct ib_device *dev,
- struct scatterlist *sg, unsigned int sg_dma_len, int direction)
-{
- unsigned int i;
-
- for (i = 0; i < sg_dma_len; ++i) {
- ib_dma_sync_single_for_device(dev,
- ib_sg_dma_address(dev, &sg[i]),
- ib_sg_dma_len(dev, &sg[i]),
- direction);
- }
-}
-#define ib_dma_sync_sg_for_device rds_iw_dma_sync_sg_for_device
-
-static inline u32 rds_iw_local_dma_lkey(struct rds_iw_connection *ic)
-{
- return ic->i_dma_local_lkey ? ic->i_cm_id->device->local_dma_lkey : ic->i_mr->lkey;
-}
-
-/* ib.c */
-extern struct rds_transport rds_iw_transport;
-extern struct ib_client rds_iw_client;
-
-extern unsigned int fastreg_pool_size;
-extern unsigned int fastreg_message_size;
-
-extern spinlock_t iw_nodev_conns_lock;
-extern struct list_head iw_nodev_conns;
-
-/* ib_cm.c */
-int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp);
-void rds_iw_conn_free(void *arg);
-int rds_iw_conn_connect(struct rds_connection *conn);
-void rds_iw_conn_shutdown(struct rds_connection *conn);
-void rds_iw_state_change(struct sock *sk);
-int rds_iw_listen_init(void);
-void rds_iw_listen_stop(void);
-void __rds_iw_conn_error(struct rds_connection *conn, const char *, ...);
-int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event);
-int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id);
-void rds_iw_cm_connect_complete(struct rds_connection *conn,
- struct rdma_cm_event *event);
-
-
-#define rds_iw_conn_error(conn, fmt...) \
- __rds_iw_conn_error(conn, KERN_WARNING "RDS/IW: " fmt)
-
-/* ib_rdma.c */
-int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id);
-void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
-void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn);
-void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock);
-static inline void rds_iw_destroy_nodev_conns(void)
-{
- __rds_iw_destroy_conns(&iw_nodev_conns, &iw_nodev_conns_lock);
-}
-static inline void rds_iw_destroy_conns(struct rds_iw_device *rds_iwdev)
-{
- __rds_iw_destroy_conns(&rds_iwdev->conn_list, &rds_iwdev->spinlock);
-}
-struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *);
-void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo);
-void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *);
-void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret);
-void rds_iw_sync_mr(void *trans_private, int dir);
-void rds_iw_free_mr(void *trans_private, int invalidate);
-void rds_iw_flush_mrs(void);
-
-/* ib_recv.c */
-int rds_iw_recv_init(void);
-void rds_iw_recv_exit(void);
-int rds_iw_recv(struct rds_connection *conn);
-int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
- gfp_t page_gfp, int prefill);
-void rds_iw_inc_free(struct rds_incoming *inc);
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to);
-void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_recv_tasklet_fn(unsigned long data);
-void rds_iw_recv_init_ring(struct rds_iw_connection *ic);
-void rds_iw_recv_clear_ring(struct rds_iw_connection *ic);
-void rds_iw_recv_init_ack(struct rds_iw_connection *ic);
-void rds_iw_attempt_ack(struct rds_iw_connection *ic);
-void rds_iw_ack_send_complete(struct rds_iw_connection *ic);
-u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic);
-
-/* ib_ring.c */
-void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr);
-void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr);
-u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos);
-void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val);
-void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val);
-int rds_iw_ring_empty(struct rds_iw_work_ring *ring);
-int rds_iw_ring_low(struct rds_iw_work_ring *ring);
-u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring);
-u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest);
-extern wait_queue_head_t rds_iw_ring_empty_wait;
-
-/* ib_send.c */
-void rds_iw_xmit_complete(struct rds_connection *conn);
-int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off);
-void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context);
-void rds_iw_send_init_ring(struct rds_iw_connection *ic);
-void rds_iw_send_clear_ring(struct rds_iw_connection *ic);
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op);
-void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits);
-void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted);
-int rds_iw_send_grab_credits(struct rds_iw_connection *ic, u32 wanted,
- u32 *adv_credits, int need_posted, int max_posted);
-
-/* ib_stats.c */
-DECLARE_PER_CPU(struct rds_iw_statistics, rds_iw_stats);
-#define rds_iw_stats_inc(member) rds_stats_inc_which(rds_iw_stats, member)
-unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
- unsigned int avail);
-
-/* ib_sysctl.c */
-int rds_iw_sysctl_init(void);
-void rds_iw_sysctl_exit(void);
-extern unsigned long rds_iw_sysctl_max_send_wr;
-extern unsigned long rds_iw_sysctl_max_recv_wr;
-extern unsigned long rds_iw_sysctl_max_unsig_wrs;
-extern unsigned long rds_iw_sysctl_max_unsig_bytes;
-extern unsigned long rds_iw_sysctl_max_recv_allocation;
-extern unsigned int rds_iw_sysctl_flow_control;
-
-/*
- * Helper functions for getting/setting the header and data SGEs in
- * RDS packets (not RDMA)
- */
-static inline struct ib_sge *
-rds_iw_header_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
-{
- return &sge[0];
-}
-
-static inline struct ib_sge *
-rds_iw_data_sge(struct rds_iw_connection *ic, struct ib_sge *sge)
-{
- return &sge[1];
-}
-
-#endif
diff --git a/net/rds/iw_cm.c b/net/rds/iw_cm.c
deleted file mode 100644
index a6553a6fb2bc..000000000000
--- a/net/rds/iw_cm.c
+++ /dev/null
@@ -1,769 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/slab.h>
-#include <linux/vmalloc.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-/*
- * Set the selected protocol version
- */
-static void rds_iw_set_protocol(struct rds_connection *conn, unsigned int version)
-{
- conn->c_version = version;
-}
-
-/*
- * Set up flow control
- */
-static void rds_iw_set_flow_control(struct rds_connection *conn, u32 credits)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- if (rds_iw_sysctl_flow_control && credits != 0) {
- /* We're doing flow control */
- ic->i_flowctl = 1;
- rds_iw_send_add_credits(conn, credits);
- } else {
- ic->i_flowctl = 0;
- }
-}
-
-/*
- * Connection established.
- * We get here for both outgoing and incoming connection.
- */
-void rds_iw_cm_connect_complete(struct rds_connection *conn, struct rdma_cm_event *event)
-{
- const struct rds_iw_connect_private *dp = NULL;
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_device *rds_iwdev;
- int err;
-
- if (event->param.conn.private_data_len) {
- dp = event->param.conn.private_data;
-
- rds_iw_set_protocol(conn,
- RDS_PROTOCOL(dp->dp_protocol_major,
- dp->dp_protocol_minor));
- rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
- }
-
- /* update ib_device with this local ipaddr & conn */
- rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
- err = rds_iw_update_cm_id(rds_iwdev, ic->i_cm_id);
- if (err)
- printk(KERN_ERR "rds_iw_update_ipaddr failed (%d)\n", err);
- rds_iw_add_conn(rds_iwdev, conn);
-
- /* If the peer gave us the last packet it saw, process this as if
- * we had received a regular ACK. */
- if (dp && dp->dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
-
- printk(KERN_NOTICE "RDS/IW: connected to %pI4<->%pI4 version %u.%u%s\n",
- &conn->c_laddr, &conn->c_faddr,
- RDS_PROTOCOL_MAJOR(conn->c_version),
- RDS_PROTOCOL_MINOR(conn->c_version),
- ic->i_flowctl ? ", flow control" : "");
-
- rds_connect_complete(conn);
-}
-
-static void rds_iw_cm_fill_conn_param(struct rds_connection *conn,
- struct rdma_conn_param *conn_param,
- struct rds_iw_connect_private *dp,
- u32 protocol_version)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- memset(conn_param, 0, sizeof(struct rdma_conn_param));
- /* XXX tune these? */
- conn_param->responder_resources = 1;
- conn_param->initiator_depth = 1;
-
- if (dp) {
- memset(dp, 0, sizeof(*dp));
- dp->dp_saddr = conn->c_laddr;
- dp->dp_daddr = conn->c_faddr;
- dp->dp_protocol_major = RDS_PROTOCOL_MAJOR(protocol_version);
- dp->dp_protocol_minor = RDS_PROTOCOL_MINOR(protocol_version);
- dp->dp_protocol_minor_mask = cpu_to_be16(RDS_IW_SUPPORTED_PROTOCOLS);
- dp->dp_ack_seq = rds_iw_piggyb_ack(ic);
-
- /* Advertise flow control */
- if (ic->i_flowctl) {
- unsigned int credits;
-
- credits = IB_GET_POST_CREDITS(atomic_read(&ic->i_credits));
- dp->dp_credit = cpu_to_be32(credits);
- atomic_sub(IB_SET_POST_CREDITS(credits), &ic->i_credits);
- }
-
- conn_param->private_data = dp;
- conn_param->private_data_len = sizeof(*dp);
- }
-}
-
-static void rds_iw_cq_event_handler(struct ib_event *event, void *data)
-{
- rdsdebug("event %u data %p\n", event->event, data);
-}
-
-static void rds_iw_qp_event_handler(struct ib_event *event, void *data)
-{
- struct rds_connection *conn = data;
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- rdsdebug("conn %p ic %p event %u\n", conn, ic, event->event);
-
- switch (event->event) {
- case IB_EVENT_COMM_EST:
- rdma_notify(ic->i_cm_id, IB_EVENT_COMM_EST);
- break;
- case IB_EVENT_QP_REQ_ERR:
- case IB_EVENT_QP_FATAL:
- default:
- rdsdebug("Fatal QP Event %u "
- "- connection %pI4->%pI4, reconnecting\n",
- event->event, &conn->c_laddr,
- &conn->c_faddr);
- rds_conn_drop(conn);
- break;
- }
-}
-
-/*
- * Create a QP
- */
-static int rds_iw_init_qp_attrs(struct ib_qp_init_attr *attr,
- struct rds_iw_device *rds_iwdev,
- struct rds_iw_work_ring *send_ring,
- void (*send_cq_handler)(struct ib_cq *, void *),
- struct rds_iw_work_ring *recv_ring,
- void (*recv_cq_handler)(struct ib_cq *, void *),
- void *context)
-{
- struct ib_device *dev = rds_iwdev->dev;
- struct ib_cq_init_attr cq_attr = {};
- unsigned int send_size, recv_size;
- int ret;
-
- /* The offset of 1 is to accommodate the additional ACK WR. */
- send_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_send_wr + 1);
- recv_size = min_t(unsigned int, rds_iwdev->max_wrs, rds_iw_sysctl_max_recv_wr + 1);
- rds_iw_ring_resize(send_ring, send_size - 1);
- rds_iw_ring_resize(recv_ring, recv_size - 1);
-
- memset(attr, 0, sizeof(*attr));
- attr->event_handler = rds_iw_qp_event_handler;
- attr->qp_context = context;
- attr->cap.max_send_wr = send_size;
- attr->cap.max_recv_wr = recv_size;
- attr->cap.max_send_sge = rds_iwdev->max_sge;
- attr->cap.max_recv_sge = RDS_IW_RECV_SGE;
- attr->sq_sig_type = IB_SIGNAL_REQ_WR;
- attr->qp_type = IB_QPT_RC;
-
- cq_attr.cqe = send_size;
- attr->send_cq = ib_create_cq(dev, send_cq_handler,
- rds_iw_cq_event_handler,
- context, &cq_attr);
- if (IS_ERR(attr->send_cq)) {
- ret = PTR_ERR(attr->send_cq);
- attr->send_cq = NULL;
- rdsdebug("ib_create_cq send failed: %d\n", ret);
- goto out;
- }
-
- cq_attr.cqe = recv_size;
- attr->recv_cq = ib_create_cq(dev, recv_cq_handler,
- rds_iw_cq_event_handler,
- context, &cq_attr);
- if (IS_ERR(attr->recv_cq)) {
- ret = PTR_ERR(attr->recv_cq);
- attr->recv_cq = NULL;
- rdsdebug("ib_create_cq send failed: %d\n", ret);
- goto out;
- }
-
- ret = ib_req_notify_cq(attr->send_cq, IB_CQ_NEXT_COMP);
- if (ret) {
- rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
- goto out;
- }
-
- ret = ib_req_notify_cq(attr->recv_cq, IB_CQ_SOLICITED);
- if (ret) {
- rdsdebug("ib_req_notify_cq recv failed: %d\n", ret);
- goto out;
- }
-
-out:
- if (ret) {
- if (attr->send_cq)
- ib_destroy_cq(attr->send_cq);
- if (attr->recv_cq)
- ib_destroy_cq(attr->recv_cq);
- }
- return ret;
-}
-
-/*
- * This needs to be very careful to not leave IS_ERR pointers around for
- * cleanup to trip over.
- */
-static int rds_iw_setup_qp(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct ib_device *dev = ic->i_cm_id->device;
- struct ib_qp_init_attr attr;
- struct rds_iw_device *rds_iwdev;
- int ret;
-
- /* rds_iw_add_one creates a rds_iw_device object per IB device,
- * and allocates a protection domain, memory range and MR pool
- * for each. If that fails for any reason, it will not register
- * the rds_iwdev at all.
- */
- rds_iwdev = ib_get_client_data(dev, &rds_iw_client);
- if (!rds_iwdev) {
- printk_ratelimited(KERN_NOTICE "RDS/IW: No client_data for device %s\n",
- dev->name);
- return -EOPNOTSUPP;
- }
-
- /* Protection domain and memory range */
- ic->i_pd = rds_iwdev->pd;
- ic->i_mr = rds_iwdev->mr;
-
- ret = rds_iw_init_qp_attrs(&attr, rds_iwdev,
- &ic->i_send_ring, rds_iw_send_cq_comp_handler,
- &ic->i_recv_ring, rds_iw_recv_cq_comp_handler,
- conn);
- if (ret < 0)
- goto out;
-
- ic->i_send_cq = attr.send_cq;
- ic->i_recv_cq = attr.recv_cq;
-
- /*
- * XXX this can fail if max_*_wr is too large? Are we supposed
- * to back off until we get a value that the hardware can support?
- */
- ret = rdma_create_qp(ic->i_cm_id, ic->i_pd, &attr);
- if (ret) {
- rdsdebug("rdma_create_qp failed: %d\n", ret);
- goto out;
- }
-
- ic->i_send_hdrs = ib_dma_alloc_coherent(dev,
- ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- &ic->i_send_hdrs_dma, GFP_KERNEL);
- if (!ic->i_send_hdrs) {
- ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent send failed\n");
- goto out;
- }
-
- ic->i_recv_hdrs = ib_dma_alloc_coherent(dev,
- ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- &ic->i_recv_hdrs_dma, GFP_KERNEL);
- if (!ic->i_recv_hdrs) {
- ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent recv failed\n");
- goto out;
- }
-
- ic->i_ack = ib_dma_alloc_coherent(dev, sizeof(struct rds_header),
- &ic->i_ack_dma, GFP_KERNEL);
- if (!ic->i_ack) {
- ret = -ENOMEM;
- rdsdebug("ib_dma_alloc_coherent ack failed\n");
- goto out;
- }
-
- ic->i_sends = vmalloc(ic->i_send_ring.w_nr * sizeof(struct rds_iw_send_work));
- if (!ic->i_sends) {
- ret = -ENOMEM;
- rdsdebug("send allocation failed\n");
- goto out;
- }
- rds_iw_send_init_ring(ic);
-
- ic->i_recvs = vmalloc(ic->i_recv_ring.w_nr * sizeof(struct rds_iw_recv_work));
- if (!ic->i_recvs) {
- ret = -ENOMEM;
- rdsdebug("recv allocation failed\n");
- goto out;
- }
-
- rds_iw_recv_init_ring(ic);
- rds_iw_recv_init_ack(ic);
-
- /* Post receive buffers - as a side effect, this will update
- * the posted credit count. */
- rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 1);
-
- rdsdebug("conn %p pd %p mr %p cq %p %p\n", conn, ic->i_pd, ic->i_mr,
- ic->i_send_cq, ic->i_recv_cq);
-
-out:
- return ret;
-}
-
-static u32 rds_iw_protocol_compatible(const struct rds_iw_connect_private *dp)
-{
- u16 common;
- u32 version = 0;
-
- /* rdma_cm private data is odd - when there is any private data in the
- * request, we will be given a pretty large buffer without telling us the
- * original size. The only way to tell the difference is by looking at
- * the contents, which are initialized to zero.
- * If the protocol version fields aren't set, this is a connection attempt
- * from an older version. This could could be 3.0 or 2.0 - we can't tell.
- * We really should have changed this for OFED 1.3 :-( */
- if (dp->dp_protocol_major == 0)
- return RDS_PROTOCOL_3_0;
-
- common = be16_to_cpu(dp->dp_protocol_minor_mask) & RDS_IW_SUPPORTED_PROTOCOLS;
- if (dp->dp_protocol_major == 3 && common) {
- version = RDS_PROTOCOL_3_0;
- while ((common >>= 1) != 0)
- version++;
- }
- printk_ratelimited(KERN_NOTICE "RDS: Connection from %pI4 using "
- "incompatible protocol version %u.%u\n",
- &dp->dp_saddr,
- dp->dp_protocol_major,
- dp->dp_protocol_minor);
- return version;
-}
-
-int rds_iw_cm_handle_connect(struct rdma_cm_id *cm_id,
- struct rdma_cm_event *event)
-{
- const struct rds_iw_connect_private *dp = event->param.conn.private_data;
- struct rds_iw_connect_private dp_rep;
- struct rds_connection *conn = NULL;
- struct rds_iw_connection *ic = NULL;
- struct rdma_conn_param conn_param;
- struct rds_iw_device *rds_iwdev;
- u32 version;
- int err, destroy = 1;
-
- /* Check whether the remote protocol version matches ours. */
- version = rds_iw_protocol_compatible(dp);
- if (!version)
- goto out;
-
- rdsdebug("saddr %pI4 daddr %pI4 RDSv%u.%u\n",
- &dp->dp_saddr, &dp->dp_daddr,
- RDS_PROTOCOL_MAJOR(version), RDS_PROTOCOL_MINOR(version));
-
- /* RDS/IW is not currently netns aware, thus init_net */
- conn = rds_conn_create(&init_net, dp->dp_daddr, dp->dp_saddr,
- &rds_iw_transport, GFP_KERNEL);
- if (IS_ERR(conn)) {
- rdsdebug("rds_conn_create failed (%ld)\n", PTR_ERR(conn));
- conn = NULL;
- goto out;
- }
-
- /*
- * The connection request may occur while the
- * previous connection exist, e.g. in case of failover.
- * But as connections may be initiated simultaneously
- * by both hosts, we have a random backoff mechanism -
- * see the comment above rds_queue_reconnect()
- */
- mutex_lock(&conn->c_cm_lock);
- if (!rds_conn_transition(conn, RDS_CONN_DOWN, RDS_CONN_CONNECTING)) {
- if (rds_conn_state(conn) == RDS_CONN_UP) {
- rdsdebug("incoming connect while connecting\n");
- rds_conn_drop(conn);
- rds_iw_stats_inc(s_iw_listen_closed_stale);
- } else
- if (rds_conn_state(conn) == RDS_CONN_CONNECTING) {
- /* Wait and see - our connect may still be succeeding */
- rds_iw_stats_inc(s_iw_connect_raced);
- }
- mutex_unlock(&conn->c_cm_lock);
- goto out;
- }
-
- ic = conn->c_transport_data;
-
- rds_iw_set_protocol(conn, version);
- rds_iw_set_flow_control(conn, be32_to_cpu(dp->dp_credit));
-
- /* If the peer gave us the last packet it saw, process this as if
- * we had received a regular ACK. */
- if (dp->dp_ack_seq)
- rds_send_drop_acked(conn, be64_to_cpu(dp->dp_ack_seq), NULL);
-
- BUG_ON(cm_id->context);
- BUG_ON(ic->i_cm_id);
-
- ic->i_cm_id = cm_id;
- cm_id->context = conn;
-
- rds_iwdev = ib_get_client_data(cm_id->device, &rds_iw_client);
- ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
-
- /* We got halfway through setting up the ib_connection, if we
- * fail now, we have to take the long route out of this mess. */
- destroy = 0;
-
- err = rds_iw_setup_qp(conn);
- if (err) {
- rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", err);
- mutex_unlock(&conn->c_cm_lock);
- goto out;
- }
-
- rds_iw_cm_fill_conn_param(conn, &conn_param, &dp_rep, version);
-
- /* rdma_accept() calls rdma_reject() internally if it fails */
- err = rdma_accept(cm_id, &conn_param);
- mutex_unlock(&conn->c_cm_lock);
- if (err) {
- rds_iw_conn_error(conn, "rdma_accept failed (%d)\n", err);
- goto out;
- }
-
- return 0;
-
-out:
- rdma_reject(cm_id, NULL, 0);
- return destroy;
-}
-
-
-int rds_iw_cm_initiate_connect(struct rdma_cm_id *cm_id)
-{
- struct rds_connection *conn = cm_id->context;
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rdma_conn_param conn_param;
- struct rds_iw_connect_private dp;
- int ret;
-
- /* If the peer doesn't do protocol negotiation, we must
- * default to RDSv3.0 */
- rds_iw_set_protocol(conn, RDS_PROTOCOL_3_0);
- ic->i_flowctl = rds_iw_sysctl_flow_control; /* advertise flow control */
-
- ret = rds_iw_setup_qp(conn);
- if (ret) {
- rds_iw_conn_error(conn, "rds_iw_setup_qp failed (%d)\n", ret);
- goto out;
- }
-
- rds_iw_cm_fill_conn_param(conn, &conn_param, &dp, RDS_PROTOCOL_VERSION);
-
- ret = rdma_connect(cm_id, &conn_param);
- if (ret)
- rds_iw_conn_error(conn, "rdma_connect failed (%d)\n", ret);
-
-out:
- /* Beware - returning non-zero tells the rdma_cm to destroy
- * the cm_id. We should certainly not do it as long as we still
- * "own" the cm_id. */
- if (ret) {
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- if (ic->i_cm_id == cm_id)
- ret = 0;
- }
- return ret;
-}
-
-int rds_iw_conn_connect(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_device *rds_iwdev;
- struct sockaddr_in src, dest;
- int ret;
-
- /* XXX I wonder what affect the port space has */
- /* delegate cm event handler to rdma_transport */
- ic->i_cm_id = rdma_create_id(rds_rdma_cm_event_handler, conn,
- RDMA_PS_TCP, IB_QPT_RC);
- if (IS_ERR(ic->i_cm_id)) {
- ret = PTR_ERR(ic->i_cm_id);
- ic->i_cm_id = NULL;
- rdsdebug("rdma_create_id() failed: %d\n", ret);
- goto out;
- }
-
- rdsdebug("created cm id %p for conn %p\n", ic->i_cm_id, conn);
-
- src.sin_family = AF_INET;
- src.sin_addr.s_addr = (__force u32)conn->c_laddr;
- src.sin_port = (__force u16)htons(0);
-
- /* First, bind to the local address and device. */
- ret = rdma_bind_addr(ic->i_cm_id, (struct sockaddr *) &src);
- if (ret) {
- rdsdebug("rdma_bind_addr(%pI4) failed: %d\n",
- &conn->c_laddr, ret);
- rdma_destroy_id(ic->i_cm_id);
- ic->i_cm_id = NULL;
- goto out;
- }
-
- rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
- ic->i_dma_local_lkey = rds_iwdev->dma_local_lkey;
-
- dest.sin_family = AF_INET;
- dest.sin_addr.s_addr = (__force u32)conn->c_faddr;
- dest.sin_port = (__force u16)htons(RDS_PORT);
-
- ret = rdma_resolve_addr(ic->i_cm_id, (struct sockaddr *)&src,
- (struct sockaddr *)&dest,
- RDS_RDMA_RESOLVE_TIMEOUT_MS);
- if (ret) {
- rdsdebug("addr resolve failed for cm id %p: %d\n", ic->i_cm_id,
- ret);
- rdma_destroy_id(ic->i_cm_id);
- ic->i_cm_id = NULL;
- }
-
-out:
- return ret;
-}
-
-/*
- * This is so careful about only cleaning up resources that were built up
- * so that it can be called at any point during startup. In fact it
- * can be called multiple times for a given connection.
- */
-void rds_iw_conn_shutdown(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- int err = 0;
- struct ib_qp_attr qp_attr;
-
- rdsdebug("cm %p pd %p cq %p %p qp %p\n", ic->i_cm_id,
- ic->i_pd, ic->i_send_cq, ic->i_recv_cq,
- ic->i_cm_id ? ic->i_cm_id->qp : NULL);
-
- if (ic->i_cm_id) {
- struct ib_device *dev = ic->i_cm_id->device;
-
- rdsdebug("disconnecting cm %p\n", ic->i_cm_id);
- err = rdma_disconnect(ic->i_cm_id);
- if (err) {
- /* Actually this may happen quite frequently, when
- * an outgoing connect raced with an incoming connect.
- */
- rdsdebug("failed to disconnect, cm: %p err %d\n",
- ic->i_cm_id, err);
- }
-
- if (ic->i_cm_id->qp) {
- qp_attr.qp_state = IB_QPS_ERR;
- ib_modify_qp(ic->i_cm_id->qp, &qp_attr, IB_QP_STATE);
- }
-
- wait_event(rds_iw_ring_empty_wait,
- rds_iw_ring_empty(&ic->i_send_ring) &&
- rds_iw_ring_empty(&ic->i_recv_ring));
-
- if (ic->i_send_hdrs)
- ib_dma_free_coherent(dev,
- ic->i_send_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_send_hdrs,
- ic->i_send_hdrs_dma);
-
- if (ic->i_recv_hdrs)
- ib_dma_free_coherent(dev,
- ic->i_recv_ring.w_nr *
- sizeof(struct rds_header),
- ic->i_recv_hdrs,
- ic->i_recv_hdrs_dma);
-
- if (ic->i_ack)
- ib_dma_free_coherent(dev, sizeof(struct rds_header),
- ic->i_ack, ic->i_ack_dma);
-
- if (ic->i_sends)
- rds_iw_send_clear_ring(ic);
- if (ic->i_recvs)
- rds_iw_recv_clear_ring(ic);
-
- if (ic->i_cm_id->qp)
- rdma_destroy_qp(ic->i_cm_id);
- if (ic->i_send_cq)
- ib_destroy_cq(ic->i_send_cq);
- if (ic->i_recv_cq)
- ib_destroy_cq(ic->i_recv_cq);
-
- /*
- * If associated with an rds_iw_device:
- * Move connection back to the nodev list.
- * Remove cm_id from the device cm_id list.
- */
- if (ic->rds_iwdev)
- rds_iw_remove_conn(ic->rds_iwdev, conn);
-
- rdma_destroy_id(ic->i_cm_id);
-
- ic->i_cm_id = NULL;
- ic->i_pd = NULL;
- ic->i_mr = NULL;
- ic->i_send_cq = NULL;
- ic->i_recv_cq = NULL;
- ic->i_send_hdrs = NULL;
- ic->i_recv_hdrs = NULL;
- ic->i_ack = NULL;
- }
- BUG_ON(ic->rds_iwdev);
-
- /* Clear pending transmit */
- if (ic->i_rm) {
- rds_message_put(ic->i_rm);
- ic->i_rm = NULL;
- }
-
- /* Clear the ACK state */
- clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
-#ifdef KERNEL_HAS_ATOMIC64
- atomic64_set(&ic->i_ack_next, 0);
-#else
- ic->i_ack_next = 0;
-#endif
- ic->i_ack_recv = 0;
-
- /* Clear flow control state */
- ic->i_flowctl = 0;
- atomic_set(&ic->i_credits, 0);
-
- rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
- rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
-
- if (ic->i_iwinc) {
- rds_inc_put(&ic->i_iwinc->ii_inc);
- ic->i_iwinc = NULL;
- }
-
- vfree(ic->i_sends);
- ic->i_sends = NULL;
- vfree(ic->i_recvs);
- ic->i_recvs = NULL;
- rdsdebug("shutdown complete\n");
-}
-
-int rds_iw_conn_alloc(struct rds_connection *conn, gfp_t gfp)
-{
- struct rds_iw_connection *ic;
- unsigned long flags;
-
- /* XXX too lazy? */
- ic = kzalloc(sizeof(struct rds_iw_connection), gfp);
- if (!ic)
- return -ENOMEM;
-
- INIT_LIST_HEAD(&ic->iw_node);
- tasklet_init(&ic->i_recv_tasklet, rds_iw_recv_tasklet_fn,
- (unsigned long) ic);
- mutex_init(&ic->i_recv_mutex);
-#ifndef KERNEL_HAS_ATOMIC64
- spin_lock_init(&ic->i_ack_lock);
-#endif
-
- /*
- * rds_iw_conn_shutdown() waits for these to be emptied so they
- * must be initialized before it can be called.
- */
- rds_iw_ring_init(&ic->i_send_ring, rds_iw_sysctl_max_send_wr);
- rds_iw_ring_init(&ic->i_recv_ring, rds_iw_sysctl_max_recv_wr);
-
- ic->conn = conn;
- conn->c_transport_data = ic;
-
- spin_lock_irqsave(&iw_nodev_conns_lock, flags);
- list_add_tail(&ic->iw_node, &iw_nodev_conns);
- spin_unlock_irqrestore(&iw_nodev_conns_lock, flags);
-
-
- rdsdebug("conn %p conn ic %p\n", conn, conn->c_transport_data);
- return 0;
-}
-
-/*
- * Free a connection. Connection must be shut down and not set for reconnect.
- */
-void rds_iw_conn_free(void *arg)
-{
- struct rds_iw_connection *ic = arg;
- spinlock_t *lock_ptr;
-
- rdsdebug("ic %p\n", ic);
-
- /*
- * Conn is either on a dev's list or on the nodev list.
- * A race with shutdown() or connect() would cause problems
- * (since rds_iwdev would change) but that should never happen.
- */
- lock_ptr = ic->rds_iwdev ? &ic->rds_iwdev->spinlock : &iw_nodev_conns_lock;
-
- spin_lock_irq(lock_ptr);
- list_del(&ic->iw_node);
- spin_unlock_irq(lock_ptr);
-
- kfree(ic);
-}
-
-/*
- * An error occurred on the connection
- */
-void
-__rds_iw_conn_error(struct rds_connection *conn, const char *fmt, ...)
-{
- va_list ap;
-
- rds_conn_drop(conn);
-
- va_start(ap, fmt);
- vprintk(fmt, ap);
- va_end(ap);
-}
diff --git a/net/rds/iw_rdma.c b/net/rds/iw_rdma.c
deleted file mode 100644
index d3d4454ffc84..000000000000
--- a/net/rds/iw_rdma.c
+++ /dev/null
@@ -1,874 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-
-/*
- * This is stored as mr->r_trans_private.
- */
-struct rds_iw_mr {
- struct rds_iw_device *device;
- struct rds_iw_mr_pool *pool;
- struct rdma_cm_id *cm_id;
-
- struct ib_mr *mr;
- struct ib_fast_reg_page_list *page_list;
-
- struct rds_iw_mapping mapping;
- unsigned char remap_count;
-};
-
-/*
- * Our own little MR pool
- */
-struct rds_iw_mr_pool {
- struct rds_iw_device *device; /* back ptr to the device that owns us */
-
- struct mutex flush_lock; /* serialize fmr invalidate */
- struct work_struct flush_worker; /* flush worker */
-
- spinlock_t list_lock; /* protect variables below */
- atomic_t item_count; /* total # of MRs */
- atomic_t dirty_count; /* # dirty of MRs */
- struct list_head dirty_list; /* dirty mappings */
- struct list_head clean_list; /* unused & unamapped MRs */
- atomic_t free_pinned; /* memory pinned by free MRs */
- unsigned long max_message_size; /* in pages */
- unsigned long max_items;
- unsigned long max_items_soft;
- unsigned long max_free_pinned;
- int max_pages;
-};
-
-static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all);
-static void rds_iw_mr_pool_flush_worker(struct work_struct *work);
-static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr,
- struct scatterlist *sg, unsigned int nents);
-static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
- struct list_head *unmap_list,
- struct list_head *kill_list,
- int *unpinned);
-static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool, struct rds_iw_mr *ibmr);
-
-static int rds_iw_get_device(struct sockaddr_in *src, struct sockaddr_in *dst,
- struct rds_iw_device **rds_iwdev,
- struct rdma_cm_id **cm_id)
-{
- struct rds_iw_device *iwdev;
- struct rds_iw_cm_id *i_cm_id;
-
- *rds_iwdev = NULL;
- *cm_id = NULL;
-
- list_for_each_entry(iwdev, &rds_iw_devices, list) {
- spin_lock_irq(&iwdev->spinlock);
- list_for_each_entry(i_cm_id, &iwdev->cm_id_list, list) {
- struct sockaddr_in *src_addr, *dst_addr;
-
- src_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.src_addr;
- dst_addr = (struct sockaddr_in *)&i_cm_id->cm_id->route.addr.dst_addr;
-
- rdsdebug("local ipaddr = %x port %d, "
- "remote ipaddr = %x port %d"
- "..looking for %x port %d, "
- "remote ipaddr = %x port %d\n",
- src_addr->sin_addr.s_addr,
- src_addr->sin_port,
- dst_addr->sin_addr.s_addr,
- dst_addr->sin_port,
- src->sin_addr.s_addr,
- src->sin_port,
- dst->sin_addr.s_addr,
- dst->sin_port);
-#ifdef WORKING_TUPLE_DETECTION
- if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr &&
- src_addr->sin_port == src->sin_port &&
- dst_addr->sin_addr.s_addr == dst->sin_addr.s_addr &&
- dst_addr->sin_port == dst->sin_port) {
-#else
- /* FIXME - needs to compare the local and remote
- * ipaddr/port tuple, but the ipaddr is the only
- * available information in the rds_sock (as the rest are
- * zero'ed. It doesn't appear to be properly populated
- * during connection setup...
- */
- if (src_addr->sin_addr.s_addr == src->sin_addr.s_addr) {
-#endif
- spin_unlock_irq(&iwdev->spinlock);
- *rds_iwdev = iwdev;
- *cm_id = i_cm_id->cm_id;
- return 0;
- }
- }
- spin_unlock_irq(&iwdev->spinlock);
- }
-
- return 1;
-}
-
-static int rds_iw_add_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
-{
- struct rds_iw_cm_id *i_cm_id;
-
- i_cm_id = kmalloc(sizeof *i_cm_id, GFP_KERNEL);
- if (!i_cm_id)
- return -ENOMEM;
-
- i_cm_id->cm_id = cm_id;
-
- spin_lock_irq(&rds_iwdev->spinlock);
- list_add_tail(&i_cm_id->list, &rds_iwdev->cm_id_list);
- spin_unlock_irq(&rds_iwdev->spinlock);
-
- return 0;
-}
-
-static void rds_iw_remove_cm_id(struct rds_iw_device *rds_iwdev,
- struct rdma_cm_id *cm_id)
-{
- struct rds_iw_cm_id *i_cm_id;
-
- spin_lock_irq(&rds_iwdev->spinlock);
- list_for_each_entry(i_cm_id, &rds_iwdev->cm_id_list, list) {
- if (i_cm_id->cm_id == cm_id) {
- list_del(&i_cm_id->list);
- kfree(i_cm_id);
- break;
- }
- }
- spin_unlock_irq(&rds_iwdev->spinlock);
-}
-
-
-int rds_iw_update_cm_id(struct rds_iw_device *rds_iwdev, struct rdma_cm_id *cm_id)
-{
- struct sockaddr_in *src_addr, *dst_addr;
- struct rds_iw_device *rds_iwdev_old;
- struct rdma_cm_id *pcm_id;
- int rc;
-
- src_addr = (struct sockaddr_in *)&cm_id->route.addr.src_addr;
- dst_addr = (struct sockaddr_in *)&cm_id->route.addr.dst_addr;
-
- rc = rds_iw_get_device(src_addr, dst_addr, &rds_iwdev_old, &pcm_id);
- if (rc)
- rds_iw_remove_cm_id(rds_iwdev, cm_id);
-
- return rds_iw_add_cm_id(rds_iwdev, cm_id);
-}
-
-void rds_iw_add_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- /* conn was previously on the nodev_conns_list */
- spin_lock_irq(&iw_nodev_conns_lock);
- BUG_ON(list_empty(&iw_nodev_conns));
- BUG_ON(list_empty(&ic->iw_node));
- list_del(&ic->iw_node);
-
- spin_lock(&rds_iwdev->spinlock);
- list_add_tail(&ic->iw_node, &rds_iwdev->conn_list);
- spin_unlock(&rds_iwdev->spinlock);
- spin_unlock_irq(&iw_nodev_conns_lock);
-
- ic->rds_iwdev = rds_iwdev;
-}
-
-void rds_iw_remove_conn(struct rds_iw_device *rds_iwdev, struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- /* place conn on nodev_conns_list */
- spin_lock(&iw_nodev_conns_lock);
-
- spin_lock_irq(&rds_iwdev->spinlock);
- BUG_ON(list_empty(&ic->iw_node));
- list_del(&ic->iw_node);
- spin_unlock_irq(&rds_iwdev->spinlock);
-
- list_add_tail(&ic->iw_node, &iw_nodev_conns);
-
- spin_unlock(&iw_nodev_conns_lock);
-
- rds_iw_remove_cm_id(ic->rds_iwdev, ic->i_cm_id);
- ic->rds_iwdev = NULL;
-}
-
-void __rds_iw_destroy_conns(struct list_head *list, spinlock_t *list_lock)
-{
- struct rds_iw_connection *ic, *_ic;
- LIST_HEAD(tmp_list);
-
- /* avoid calling conn_destroy with irqs off */
- spin_lock_irq(list_lock);
- list_splice(list, &tmp_list);
- INIT_LIST_HEAD(list);
- spin_unlock_irq(list_lock);
-
- list_for_each_entry_safe(ic, _ic, &tmp_list, iw_node)
- rds_conn_destroy(ic->conn);
-}
-
-static void rds_iw_set_scatterlist(struct rds_iw_scatterlist *sg,
- struct scatterlist *list, unsigned int sg_len)
-{
- sg->list = list;
- sg->len = sg_len;
- sg->dma_len = 0;
- sg->dma_npages = 0;
- sg->bytes = 0;
-}
-
-static u64 *rds_iw_map_scatterlist(struct rds_iw_device *rds_iwdev,
- struct rds_iw_scatterlist *sg)
-{
- struct ib_device *dev = rds_iwdev->dev;
- u64 *dma_pages = NULL;
- int i, j, ret;
-
- WARN_ON(sg->dma_len);
-
- sg->dma_len = ib_dma_map_sg(dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
- if (unlikely(!sg->dma_len)) {
- printk(KERN_WARNING "RDS/IW: dma_map_sg failed!\n");
- return ERR_PTR(-EBUSY);
- }
-
- sg->bytes = 0;
- sg->dma_npages = 0;
-
- ret = -EINVAL;
- for (i = 0; i < sg->dma_len; ++i) {
- unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
- u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
- u64 end_addr;
-
- sg->bytes += dma_len;
-
- end_addr = dma_addr + dma_len;
- if (dma_addr & PAGE_MASK) {
- if (i > 0)
- goto out_unmap;
- dma_addr &= ~PAGE_MASK;
- }
- if (end_addr & PAGE_MASK) {
- if (i < sg->dma_len - 1)
- goto out_unmap;
- end_addr = (end_addr + PAGE_MASK) & ~PAGE_MASK;
- }
-
- sg->dma_npages += (end_addr - dma_addr) >> PAGE_SHIFT;
- }
-
- /* Now gather the dma addrs into one list */
- if (sg->dma_npages > fastreg_message_size)
- goto out_unmap;
-
- dma_pages = kmalloc(sizeof(u64) * sg->dma_npages, GFP_ATOMIC);
- if (!dma_pages) {
- ret = -ENOMEM;
- goto out_unmap;
- }
-
- for (i = j = 0; i < sg->dma_len; ++i) {
- unsigned int dma_len = ib_sg_dma_len(dev, &sg->list[i]);
- u64 dma_addr = ib_sg_dma_address(dev, &sg->list[i]);
- u64 end_addr;
-
- end_addr = dma_addr + dma_len;
- dma_addr &= ~PAGE_MASK;
- for (; dma_addr < end_addr; dma_addr += PAGE_SIZE)
- dma_pages[j++] = dma_addr;
- BUG_ON(j > sg->dma_npages);
- }
-
- return dma_pages;
-
-out_unmap:
- ib_dma_unmap_sg(rds_iwdev->dev, sg->list, sg->len, DMA_BIDIRECTIONAL);
- sg->dma_len = 0;
- kfree(dma_pages);
- return ERR_PTR(ret);
-}
-
-
-struct rds_iw_mr_pool *rds_iw_create_mr_pool(struct rds_iw_device *rds_iwdev)
-{
- struct rds_iw_mr_pool *pool;
-
- pool = kzalloc(sizeof(*pool), GFP_KERNEL);
- if (!pool) {
- printk(KERN_WARNING "RDS/IW: rds_iw_create_mr_pool alloc error\n");
- return ERR_PTR(-ENOMEM);
- }
-
- pool->device = rds_iwdev;
- INIT_LIST_HEAD(&pool->dirty_list);
- INIT_LIST_HEAD(&pool->clean_list);
- mutex_init(&pool->flush_lock);
- spin_lock_init(&pool->list_lock);
- INIT_WORK(&pool->flush_worker, rds_iw_mr_pool_flush_worker);
-
- pool->max_message_size = fastreg_message_size;
- pool->max_items = fastreg_pool_size;
- pool->max_free_pinned = pool->max_items * pool->max_message_size / 4;
- pool->max_pages = fastreg_message_size;
-
- /* We never allow more than max_items MRs to be allocated.
- * When we exceed more than max_items_soft, we start freeing
- * items more aggressively.
- * Make sure that max_items > max_items_soft > max_items / 2
- */
- pool->max_items_soft = pool->max_items * 3 / 4;
-
- return pool;
-}
-
-void rds_iw_get_mr_info(struct rds_iw_device *rds_iwdev, struct rds_info_rdma_connection *iinfo)
-{
- struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
-
- iinfo->rdma_mr_max = pool->max_items;
- iinfo->rdma_mr_size = pool->max_pages;
-}
-
-void rds_iw_destroy_mr_pool(struct rds_iw_mr_pool *pool)
-{
- flush_workqueue(rds_wq);
- rds_iw_flush_mr_pool(pool, 1);
- BUG_ON(atomic_read(&pool->item_count));
- BUG_ON(atomic_read(&pool->free_pinned));
- kfree(pool);
-}
-
-static inline struct rds_iw_mr *rds_iw_reuse_fmr(struct rds_iw_mr_pool *pool)
-{
- struct rds_iw_mr *ibmr = NULL;
- unsigned long flags;
-
- spin_lock_irqsave(&pool->list_lock, flags);
- if (!list_empty(&pool->clean_list)) {
- ibmr = list_entry(pool->clean_list.next, struct rds_iw_mr, mapping.m_list);
- list_del_init(&ibmr->mapping.m_list);
- }
- spin_unlock_irqrestore(&pool->list_lock, flags);
-
- return ibmr;
-}
-
-static struct rds_iw_mr *rds_iw_alloc_mr(struct rds_iw_device *rds_iwdev)
-{
- struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
- struct rds_iw_mr *ibmr = NULL;
- int err = 0, iter = 0;
-
- while (1) {
- ibmr = rds_iw_reuse_fmr(pool);
- if (ibmr)
- return ibmr;
-
- /* No clean MRs - now we have the choice of either
- * allocating a fresh MR up to the limit imposed by the
- * driver, or flush any dirty unused MRs.
- * We try to avoid stalling in the send path if possible,
- * so we allocate as long as we're allowed to.
- *
- * We're fussy with enforcing the FMR limit, though. If the driver
- * tells us we can't use more than N fmrs, we shouldn't start
- * arguing with it */
- if (atomic_inc_return(&pool->item_count) <= pool->max_items)
- break;
-
- atomic_dec(&pool->item_count);
-
- if (++iter > 2) {
- rds_iw_stats_inc(s_iw_rdma_mr_pool_depleted);
- return ERR_PTR(-EAGAIN);
- }
-
- /* We do have some empty MRs. Flush them out. */
- rds_iw_stats_inc(s_iw_rdma_mr_pool_wait);
- rds_iw_flush_mr_pool(pool, 0);
- }
-
- ibmr = kzalloc(sizeof(*ibmr), GFP_KERNEL);
- if (!ibmr) {
- err = -ENOMEM;
- goto out_no_cigar;
- }
-
- spin_lock_init(&ibmr->mapping.m_lock);
- INIT_LIST_HEAD(&ibmr->mapping.m_list);
- ibmr->mapping.m_mr = ibmr;
-
- err = rds_iw_init_fastreg(pool, ibmr);
- if (err)
- goto out_no_cigar;
-
- rds_iw_stats_inc(s_iw_rdma_mr_alloc);
- return ibmr;
-
-out_no_cigar:
- if (ibmr) {
- rds_iw_destroy_fastreg(pool, ibmr);
- kfree(ibmr);
- }
- atomic_dec(&pool->item_count);
- return ERR_PTR(err);
-}
-
-void rds_iw_sync_mr(void *trans_private, int direction)
-{
- struct rds_iw_mr *ibmr = trans_private;
- struct rds_iw_device *rds_iwdev = ibmr->device;
-
- switch (direction) {
- case DMA_FROM_DEVICE:
- ib_dma_sync_sg_for_cpu(rds_iwdev->dev, ibmr->mapping.m_sg.list,
- ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
- break;
- case DMA_TO_DEVICE:
- ib_dma_sync_sg_for_device(rds_iwdev->dev, ibmr->mapping.m_sg.list,
- ibmr->mapping.m_sg.dma_len, DMA_BIDIRECTIONAL);
- break;
- }
-}
-
-/*
- * Flush our pool of MRs.
- * At a minimum, all currently unused MRs are unmapped.
- * If the number of MRs allocated exceeds the limit, we also try
- * to free as many MRs as needed to get back to this limit.
- */
-static void rds_iw_flush_mr_pool(struct rds_iw_mr_pool *pool, int free_all)
-{
- struct rds_iw_mr *ibmr, *next;
- LIST_HEAD(unmap_list);
- LIST_HEAD(kill_list);
- unsigned long flags;
- unsigned int nfreed = 0, ncleaned = 0, unpinned = 0;
-
- rds_iw_stats_inc(s_iw_rdma_mr_pool_flush);
-
- mutex_lock(&pool->flush_lock);
-
- spin_lock_irqsave(&pool->list_lock, flags);
- /* Get the list of all mappings to be destroyed */
- list_splice_init(&pool->dirty_list, &unmap_list);
- if (free_all)
- list_splice_init(&pool->clean_list, &kill_list);
- spin_unlock_irqrestore(&pool->list_lock, flags);
-
- /* Batched invalidate of dirty MRs.
- * For FMR based MRs, the mappings on the unmap list are
- * actually members of an ibmr (ibmr->mapping). They either
- * migrate to the kill_list, or have been cleaned and should be
- * moved to the clean_list.
- * For fastregs, they will be dynamically allocated, and
- * will be destroyed by the unmap function.
- */
- if (!list_empty(&unmap_list)) {
- ncleaned = rds_iw_unmap_fastreg_list(pool, &unmap_list,
- &kill_list, &unpinned);
- /* If we've been asked to destroy all MRs, move those
- * that were simply cleaned to the kill list */
- if (free_all)
- list_splice_init(&unmap_list, &kill_list);
- }
-
- /* Destroy any MRs that are past their best before date */
- list_for_each_entry_safe(ibmr, next, &kill_list, mapping.m_list) {
- rds_iw_stats_inc(s_iw_rdma_mr_free);
- list_del(&ibmr->mapping.m_list);
- rds_iw_destroy_fastreg(pool, ibmr);
- kfree(ibmr);
- nfreed++;
- }
-
- /* Anything that remains are laundered ibmrs, which we can add
- * back to the clean list. */
- if (!list_empty(&unmap_list)) {
- spin_lock_irqsave(&pool->list_lock, flags);
- list_splice(&unmap_list, &pool->clean_list);
- spin_unlock_irqrestore(&pool->list_lock, flags);
- }
-
- atomic_sub(unpinned, &pool->free_pinned);
- atomic_sub(ncleaned, &pool->dirty_count);
- atomic_sub(nfreed, &pool->item_count);
-
- mutex_unlock(&pool->flush_lock);
-}
-
-static void rds_iw_mr_pool_flush_worker(struct work_struct *work)
-{
- struct rds_iw_mr_pool *pool = container_of(work, struct rds_iw_mr_pool, flush_worker);
-
- rds_iw_flush_mr_pool(pool, 0);
-}
-
-void rds_iw_free_mr(void *trans_private, int invalidate)
-{
- struct rds_iw_mr *ibmr = trans_private;
- struct rds_iw_mr_pool *pool = ibmr->device->mr_pool;
-
- rdsdebug("RDS/IW: free_mr nents %u\n", ibmr->mapping.m_sg.len);
- if (!pool)
- return;
-
- /* Return it to the pool's free list */
- rds_iw_free_fastreg(pool, ibmr);
-
- /* If we've pinned too many pages, request a flush */
- if (atomic_read(&pool->free_pinned) >= pool->max_free_pinned ||
- atomic_read(&pool->dirty_count) >= pool->max_items / 10)
- queue_work(rds_wq, &pool->flush_worker);
-
- if (invalidate) {
- if (likely(!in_interrupt())) {
- rds_iw_flush_mr_pool(pool, 0);
- } else {
- /* We get here if the user created a MR marked
- * as use_once and invalidate at the same time. */
- queue_work(rds_wq, &pool->flush_worker);
- }
- }
-}
-
-void rds_iw_flush_mrs(void)
-{
- struct rds_iw_device *rds_iwdev;
-
- list_for_each_entry(rds_iwdev, &rds_iw_devices, list) {
- struct rds_iw_mr_pool *pool = rds_iwdev->mr_pool;
-
- if (pool)
- rds_iw_flush_mr_pool(pool, 0);
- }
-}
-
-void *rds_iw_get_mr(struct scatterlist *sg, unsigned long nents,
- struct rds_sock *rs, u32 *key_ret)
-{
- struct rds_iw_device *rds_iwdev;
- struct rds_iw_mr *ibmr = NULL;
- struct rdma_cm_id *cm_id;
- struct sockaddr_in src = {
- .sin_addr.s_addr = rs->rs_bound_addr,
- .sin_port = rs->rs_bound_port,
- };
- struct sockaddr_in dst = {
- .sin_addr.s_addr = rs->rs_conn_addr,
- .sin_port = rs->rs_conn_port,
- };
- int ret;
-
- ret = rds_iw_get_device(&src, &dst, &rds_iwdev, &cm_id);
- if (ret || !cm_id) {
- ret = -ENODEV;
- goto out;
- }
-
- if (!rds_iwdev->mr_pool) {
- ret = -ENODEV;
- goto out;
- }
-
- ibmr = rds_iw_alloc_mr(rds_iwdev);
- if (IS_ERR(ibmr))
- return ibmr;
-
- ibmr->cm_id = cm_id;
- ibmr->device = rds_iwdev;
-
- ret = rds_iw_map_fastreg(rds_iwdev->mr_pool, ibmr, sg, nents);
- if (ret == 0)
- *key_ret = ibmr->mr->rkey;
- else
- printk(KERN_WARNING "RDS/IW: failed to map mr (errno=%d)\n", ret);
-
-out:
- if (ret) {
- if (ibmr)
- rds_iw_free_mr(ibmr, 0);
- ibmr = ERR_PTR(ret);
- }
- return ibmr;
-}
-
-/*
- * iWARP fastreg handling
- *
- * The life cycle of a fastreg registration is a bit different from
- * FMRs.
- * The idea behind fastreg is to have one MR, to which we bind different
- * mappings over time. To avoid stalling on the expensive map and invalidate
- * operations, these operations are pipelined on the same send queue on
- * which we want to send the message containing the r_key.
- *
- * This creates a bit of a problem for us, as we do not have the destination
- * IP in GET_MR, so the connection must be setup prior to the GET_MR call for
- * RDMA to be correctly setup. If a fastreg request is present, rds_iw_xmit
- * will try to queue a LOCAL_INV (if needed) and a FAST_REG_MR work request
- * before queuing the SEND. When completions for these arrive, they are
- * dispatched to the MR has a bit set showing that RDMa can be performed.
- *
- * There is another interesting aspect that's related to invalidation.
- * The application can request that a mapping is invalidated in FREE_MR.
- * The expectation there is that this invalidation step includes ALL
- * PREVIOUSLY FREED MRs.
- */
-static int rds_iw_init_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr)
-{
- struct rds_iw_device *rds_iwdev = pool->device;
- struct ib_fast_reg_page_list *page_list = NULL;
- struct ib_mr *mr;
- int err;
-
- mr = ib_alloc_mr(rds_iwdev->pd, IB_MR_TYPE_MEM_REG,
- pool->max_message_size);
- if (IS_ERR(mr)) {
- err = PTR_ERR(mr);
-
- printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed (err=%d)\n", err);
- return err;
- }
-
- /* FIXME - this is overkill, but mapping->m_sg.dma_len/mapping->m_sg.dma_npages
- * is not filled in.
- */
- page_list = ib_alloc_fast_reg_page_list(rds_iwdev->dev, pool->max_message_size);
- if (IS_ERR(page_list)) {
- err = PTR_ERR(page_list);
-
- printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed (err=%d)\n", err);
- ib_dereg_mr(mr);
- return err;
- }
-
- ibmr->page_list = page_list;
- ibmr->mr = mr;
- return 0;
-}
-
-static int rds_iw_rdma_build_fastreg(struct rds_iw_mapping *mapping)
-{
- struct rds_iw_mr *ibmr = mapping->m_mr;
- struct ib_send_wr f_wr, *failed_wr;
- int ret;
-
- /*
- * Perform a WR for the fast_reg_mr. Each individual page
- * in the sg list is added to the fast reg page list and placed
- * inside the fast_reg_mr WR. The key used is a rolling 8bit
- * counter, which should guarantee uniqueness.
- */
- ib_update_fast_reg_key(ibmr->mr, ibmr->remap_count++);
- mapping->m_rkey = ibmr->mr->rkey;
-
- memset(&f_wr, 0, sizeof(f_wr));
- f_wr.wr_id = RDS_IW_FAST_REG_WR_ID;
- f_wr.opcode = IB_WR_FAST_REG_MR;
- f_wr.wr.fast_reg.length = mapping->m_sg.bytes;
- f_wr.wr.fast_reg.rkey = mapping->m_rkey;
- f_wr.wr.fast_reg.page_list = ibmr->page_list;
- f_wr.wr.fast_reg.page_list_len = mapping->m_sg.dma_len;
- f_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- f_wr.wr.fast_reg.access_flags = IB_ACCESS_LOCAL_WRITE |
- IB_ACCESS_REMOTE_READ |
- IB_ACCESS_REMOTE_WRITE;
- f_wr.wr.fast_reg.iova_start = 0;
- f_wr.send_flags = IB_SEND_SIGNALED;
-
- failed_wr = &f_wr;
- ret = ib_post_send(ibmr->cm_id->qp, &f_wr, &failed_wr);
- BUG_ON(failed_wr != &f_wr);
- if (ret)
- printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
- __func__, __LINE__, ret);
- return ret;
-}
-
-static int rds_iw_rdma_fastreg_inv(struct rds_iw_mr *ibmr)
-{
- struct ib_send_wr s_wr, *failed_wr;
- int ret = 0;
-
- if (!ibmr->cm_id->qp || !ibmr->mr)
- goto out;
-
- memset(&s_wr, 0, sizeof(s_wr));
- s_wr.wr_id = RDS_IW_LOCAL_INV_WR_ID;
- s_wr.opcode = IB_WR_LOCAL_INV;
- s_wr.ex.invalidate_rkey = ibmr->mr->rkey;
- s_wr.send_flags = IB_SEND_SIGNALED;
-
- failed_wr = &s_wr;
- ret = ib_post_send(ibmr->cm_id->qp, &s_wr, &failed_wr);
- if (ret) {
- printk_ratelimited(KERN_WARNING "RDS/IW: %s:%d ib_post_send returned %d\n",
- __func__, __LINE__, ret);
- goto out;
- }
-out:
- return ret;
-}
-
-static int rds_iw_map_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr,
- struct scatterlist *sg,
- unsigned int sg_len)
-{
- struct rds_iw_device *rds_iwdev = pool->device;
- struct rds_iw_mapping *mapping = &ibmr->mapping;
- u64 *dma_pages;
- int i, ret = 0;
-
- rds_iw_set_scatterlist(&mapping->m_sg, sg, sg_len);
-
- dma_pages = rds_iw_map_scatterlist(rds_iwdev, &mapping->m_sg);
- if (IS_ERR(dma_pages)) {
- ret = PTR_ERR(dma_pages);
- dma_pages = NULL;
- goto out;
- }
-
- if (mapping->m_sg.dma_len > pool->max_message_size) {
- ret = -EMSGSIZE;
- goto out;
- }
-
- for (i = 0; i < mapping->m_sg.dma_npages; ++i)
- ibmr->page_list->page_list[i] = dma_pages[i];
-
- ret = rds_iw_rdma_build_fastreg(mapping);
- if (ret)
- goto out;
-
- rds_iw_stats_inc(s_iw_rdma_mr_used);
-
-out:
- kfree(dma_pages);
-
- return ret;
-}
-
-/*
- * "Free" a fastreg MR.
- */
-static void rds_iw_free_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr)
-{
- unsigned long flags;
- int ret;
-
- if (!ibmr->mapping.m_sg.dma_len)
- return;
-
- ret = rds_iw_rdma_fastreg_inv(ibmr);
- if (ret)
- return;
-
- /* Try to post the LOCAL_INV WR to the queue. */
- spin_lock_irqsave(&pool->list_lock, flags);
-
- list_add_tail(&ibmr->mapping.m_list, &pool->dirty_list);
- atomic_add(ibmr->mapping.m_sg.len, &pool->free_pinned);
- atomic_inc(&pool->dirty_count);
-
- spin_unlock_irqrestore(&pool->list_lock, flags);
-}
-
-static unsigned int rds_iw_unmap_fastreg_list(struct rds_iw_mr_pool *pool,
- struct list_head *unmap_list,
- struct list_head *kill_list,
- int *unpinned)
-{
- struct rds_iw_mapping *mapping, *next;
- unsigned int ncleaned = 0;
- LIST_HEAD(laundered);
-
- /* Batched invalidation of fastreg MRs.
- * Why do we do it this way, even though we could pipeline unmap
- * and remap? The reason is the application semantics - when the
- * application requests an invalidation of MRs, it expects all
- * previously released R_Keys to become invalid.
- *
- * If we implement MR reuse naively, we risk memory corruption
- * (this has actually been observed). So the default behavior
- * requires that a MR goes through an explicit unmap operation before
- * we can reuse it again.
- *
- * We could probably improve on this a little, by allowing immediate
- * reuse of a MR on the same socket (eg you could add small
- * cache of unused MRs to strct rds_socket - GET_MR could grab one
- * of these without requiring an explicit invalidate).
- */
- while (!list_empty(unmap_list)) {
- unsigned long flags;
-
- spin_lock_irqsave(&pool->list_lock, flags);
- list_for_each_entry_safe(mapping, next, unmap_list, m_list) {
- *unpinned += mapping->m_sg.len;
- list_move(&mapping->m_list, &laundered);
- ncleaned++;
- }
- spin_unlock_irqrestore(&pool->list_lock, flags);
- }
-
- /* Move all laundered mappings back to the unmap list.
- * We do not kill any WRs right now - it doesn't seem the
- * fastreg API has a max_remap limit. */
- list_splice_init(&laundered, unmap_list);
-
- return ncleaned;
-}
-
-static void rds_iw_destroy_fastreg(struct rds_iw_mr_pool *pool,
- struct rds_iw_mr *ibmr)
-{
- if (ibmr->page_list)
- ib_free_fast_reg_page_list(ibmr->page_list);
- if (ibmr->mr)
- ib_dereg_mr(ibmr->mr);
-}
diff --git a/net/rds/iw_recv.c b/net/rds/iw_recv.c
deleted file mode 100644
index a66d1794b2d0..000000000000
--- a/net/rds/iw_recv.c
+++ /dev/null
@@ -1,904 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/slab.h>
-#include <linux/pci.h>
-#include <linux/dma-mapping.h>
-#include <rdma/rdma_cm.h>
-
-#include "rds.h"
-#include "iw.h"
-
-static struct kmem_cache *rds_iw_incoming_slab;
-static struct kmem_cache *rds_iw_frag_slab;
-static atomic_t rds_iw_allocation = ATOMIC_INIT(0);
-
-static void rds_iw_frag_drop_page(struct rds_page_frag *frag)
-{
- rdsdebug("frag %p page %p\n", frag, frag->f_page);
- __free_page(frag->f_page);
- frag->f_page = NULL;
-}
-
-static void rds_iw_frag_free(struct rds_page_frag *frag)
-{
- rdsdebug("frag %p page %p\n", frag, frag->f_page);
- BUG_ON(frag->f_page);
- kmem_cache_free(rds_iw_frag_slab, frag);
-}
-
-/*
- * We map a page at a time. Its fragments are posted in order. This
- * is called in fragment order as the fragments get send completion events.
- * Only the last frag in the page performs the unmapping.
- *
- * It's OK for ring cleanup to call this in whatever order it likes because
- * DMA is not in flight and so we can unmap while other ring entries still
- * hold page references in their frags.
- */
-static void rds_iw_recv_unmap_page(struct rds_iw_connection *ic,
- struct rds_iw_recv_work *recv)
-{
- struct rds_page_frag *frag = recv->r_frag;
-
- rdsdebug("recv %p frag %p page %p\n", recv, frag, frag->f_page);
- if (frag->f_mapped)
- ib_dma_unmap_page(ic->i_cm_id->device,
- frag->f_mapped,
- RDS_FRAG_SIZE, DMA_FROM_DEVICE);
- frag->f_mapped = 0;
-}
-
-void rds_iw_recv_init_ring(struct rds_iw_connection *ic)
-{
- struct rds_iw_recv_work *recv;
- u32 i;
-
- for (i = 0, recv = ic->i_recvs; i < ic->i_recv_ring.w_nr; i++, recv++) {
- struct ib_sge *sge;
-
- recv->r_iwinc = NULL;
- recv->r_frag = NULL;
-
- recv->r_wr.next = NULL;
- recv->r_wr.wr_id = i;
- recv->r_wr.sg_list = recv->r_sge;
- recv->r_wr.num_sge = RDS_IW_RECV_SGE;
-
- sge = rds_iw_data_sge(ic, recv->r_sge);
- sge->addr = 0;
- sge->length = RDS_FRAG_SIZE;
- sge->lkey = 0;
-
- sge = rds_iw_header_sge(ic, recv->r_sge);
- sge->addr = ic->i_recv_hdrs_dma + (i * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = 0;
- }
-}
-
-static void rds_iw_recv_clear_one(struct rds_iw_connection *ic,
- struct rds_iw_recv_work *recv)
-{
- if (recv->r_iwinc) {
- rds_inc_put(&recv->r_iwinc->ii_inc);
- recv->r_iwinc = NULL;
- }
- if (recv->r_frag) {
- rds_iw_recv_unmap_page(ic, recv);
- if (recv->r_frag->f_page)
- rds_iw_frag_drop_page(recv->r_frag);
- rds_iw_frag_free(recv->r_frag);
- recv->r_frag = NULL;
- }
-}
-
-void rds_iw_recv_clear_ring(struct rds_iw_connection *ic)
-{
- u32 i;
-
- for (i = 0; i < ic->i_recv_ring.w_nr; i++)
- rds_iw_recv_clear_one(ic, &ic->i_recvs[i]);
-
- if (ic->i_frag.f_page)
- rds_iw_frag_drop_page(&ic->i_frag);
-}
-
-static int rds_iw_recv_refill_one(struct rds_connection *conn,
- struct rds_iw_recv_work *recv,
- gfp_t kptr_gfp, gfp_t page_gfp)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- dma_addr_t dma_addr;
- struct ib_sge *sge;
- int ret = -ENOMEM;
-
- if (!recv->r_iwinc) {
- if (!atomic_add_unless(&rds_iw_allocation, 1, rds_iw_sysctl_max_recv_allocation)) {
- rds_iw_stats_inc(s_iw_rx_alloc_limit);
- goto out;
- }
- recv->r_iwinc = kmem_cache_alloc(rds_iw_incoming_slab,
- kptr_gfp);
- if (!recv->r_iwinc) {
- atomic_dec(&rds_iw_allocation);
- goto out;
- }
- INIT_LIST_HEAD(&recv->r_iwinc->ii_frags);
- rds_inc_init(&recv->r_iwinc->ii_inc, conn, conn->c_faddr);
- }
-
- if (!recv->r_frag) {
- recv->r_frag = kmem_cache_alloc(rds_iw_frag_slab, kptr_gfp);
- if (!recv->r_frag)
- goto out;
- INIT_LIST_HEAD(&recv->r_frag->f_item);
- recv->r_frag->f_page = NULL;
- }
-
- if (!ic->i_frag.f_page) {
- ic->i_frag.f_page = alloc_page(page_gfp);
- if (!ic->i_frag.f_page)
- goto out;
- ic->i_frag.f_offset = 0;
- }
-
- dma_addr = ib_dma_map_page(ic->i_cm_id->device,
- ic->i_frag.f_page,
- ic->i_frag.f_offset,
- RDS_FRAG_SIZE,
- DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(ic->i_cm_id->device, dma_addr))
- goto out;
-
- /*
- * Once we get the RDS_PAGE_LAST_OFF frag then rds_iw_frag_unmap()
- * must be called on this recv. This happens as completions hit
- * in order or on connection shutdown.
- */
- recv->r_frag->f_page = ic->i_frag.f_page;
- recv->r_frag->f_offset = ic->i_frag.f_offset;
- recv->r_frag->f_mapped = dma_addr;
-
- sge = rds_iw_data_sge(ic, recv->r_sge);
- sge->addr = dma_addr;
- sge->length = RDS_FRAG_SIZE;
-
- sge = rds_iw_header_sge(ic, recv->r_sge);
- sge->addr = ic->i_recv_hdrs_dma + (recv - ic->i_recvs) * sizeof(struct rds_header);
- sge->length = sizeof(struct rds_header);
-
- get_page(recv->r_frag->f_page);
-
- if (ic->i_frag.f_offset < RDS_PAGE_LAST_OFF) {
- ic->i_frag.f_offset += RDS_FRAG_SIZE;
- } else {
- put_page(ic->i_frag.f_page);
- ic->i_frag.f_page = NULL;
- ic->i_frag.f_offset = 0;
- }
-
- ret = 0;
-out:
- return ret;
-}
-
-/*
- * This tries to allocate and post unused work requests after making sure that
- * they have all the allocations they need to queue received fragments into
- * sockets. The i_recv_mutex is held here so that ring_alloc and _unalloc
- * pairs don't go unmatched.
- *
- * -1 is returned if posting fails due to temporary resource exhaustion.
- */
-int rds_iw_recv_refill(struct rds_connection *conn, gfp_t kptr_gfp,
- gfp_t page_gfp, int prefill)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_recv_work *recv;
- struct ib_recv_wr *failed_wr;
- unsigned int posted = 0;
- int ret = 0;
- u32 pos;
-
- while ((prefill || rds_conn_up(conn)) &&
- rds_iw_ring_alloc(&ic->i_recv_ring, 1, &pos)) {
- if (pos >= ic->i_recv_ring.w_nr) {
- printk(KERN_NOTICE "Argh - ring alloc returned pos=%u\n",
- pos);
- ret = -EINVAL;
- break;
- }
-
- recv = &ic->i_recvs[pos];
- ret = rds_iw_recv_refill_one(conn, recv, kptr_gfp, page_gfp);
- if (ret) {
- ret = -1;
- break;
- }
-
- /* XXX when can this fail? */
- ret = ib_post_recv(ic->i_cm_id->qp, &recv->r_wr, &failed_wr);
- rdsdebug("recv %p iwinc %p page %p addr %lu ret %d\n", recv,
- recv->r_iwinc, recv->r_frag->f_page,
- (long) recv->r_frag->f_mapped, ret);
- if (ret) {
- rds_iw_conn_error(conn, "recv post on "
- "%pI4 returned %d, disconnecting and "
- "reconnecting\n", &conn->c_faddr,
- ret);
- ret = -1;
- break;
- }
-
- posted++;
- }
-
- /* We're doing flow control - update the window. */
- if (ic->i_flowctl && posted)
- rds_iw_advertise_credits(conn, posted);
-
- if (ret)
- rds_iw_ring_unalloc(&ic->i_recv_ring, 1);
- return ret;
-}
-
-static void rds_iw_inc_purge(struct rds_incoming *inc)
-{
- struct rds_iw_incoming *iwinc;
- struct rds_page_frag *frag;
- struct rds_page_frag *pos;
-
- iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
- rdsdebug("purging iwinc %p inc %p\n", iwinc, inc);
-
- list_for_each_entry_safe(frag, pos, &iwinc->ii_frags, f_item) {
- list_del_init(&frag->f_item);
- rds_iw_frag_drop_page(frag);
- rds_iw_frag_free(frag);
- }
-}
-
-void rds_iw_inc_free(struct rds_incoming *inc)
-{
- struct rds_iw_incoming *iwinc;
-
- iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
-
- rds_iw_inc_purge(inc);
- rdsdebug("freeing iwinc %p inc %p\n", iwinc, inc);
- BUG_ON(!list_empty(&iwinc->ii_frags));
- kmem_cache_free(rds_iw_incoming_slab, iwinc);
- atomic_dec(&rds_iw_allocation);
- BUG_ON(atomic_read(&rds_iw_allocation) < 0);
-}
-
-int rds_iw_inc_copy_to_user(struct rds_incoming *inc, struct iov_iter *to)
-{
- struct rds_iw_incoming *iwinc;
- struct rds_page_frag *frag;
- unsigned long to_copy;
- unsigned long frag_off = 0;
- int copied = 0;
- int ret;
- u32 len;
-
- iwinc = container_of(inc, struct rds_iw_incoming, ii_inc);
- frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
- len = be32_to_cpu(inc->i_hdr.h_len);
-
- while (iov_iter_count(to) && copied < len) {
- if (frag_off == RDS_FRAG_SIZE) {
- frag = list_entry(frag->f_item.next,
- struct rds_page_frag, f_item);
- frag_off = 0;
- }
- to_copy = min_t(unsigned long, iov_iter_count(to),
- RDS_FRAG_SIZE - frag_off);
- to_copy = min_t(unsigned long, to_copy, len - copied);
-
- /* XXX needs + offset for multiple recvs per page */
- rds_stats_add(s_copy_to_user, to_copy);
- ret = copy_page_to_iter(frag->f_page,
- frag->f_offset + frag_off,
- to_copy,
- to);
- if (ret != to_copy)
- return -EFAULT;
-
- frag_off += to_copy;
- copied += to_copy;
- }
-
- return copied;
-}
-
-/* ic starts out kzalloc()ed */
-void rds_iw_recv_init_ack(struct rds_iw_connection *ic)
-{
- struct ib_send_wr *wr = &ic->i_ack_wr;
- struct ib_sge *sge = &ic->i_ack_sge;
-
- sge->addr = ic->i_ack_dma;
- sge->length = sizeof(struct rds_header);
- sge->lkey = rds_iw_local_dma_lkey(ic);
-
- wr->sg_list = sge;
- wr->num_sge = 1;
- wr->opcode = IB_WR_SEND;
- wr->wr_id = RDS_IW_ACK_WR_ID;
- wr->send_flags = IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-}
-
-/*
- * You'd think that with reliable IB connections you wouldn't need to ack
- * messages that have been received. The problem is that IB hardware generates
- * an ack message before it has DMAed the message into memory. This creates a
- * potential message loss if the HCA is disabled for any reason between when it
- * sends the ack and before the message is DMAed and processed. This is only a
- * potential issue if another HCA is available for fail-over.
- *
- * When the remote host receives our ack they'll free the sent message from
- * their send queue. To decrease the latency of this we always send an ack
- * immediately after we've received messages.
- *
- * For simplicity, we only have one ack in flight at a time. This puts
- * pressure on senders to have deep enough send queues to absorb the latency of
- * a single ack frame being in flight. This might not be good enough.
- *
- * This is implemented by have a long-lived send_wr and sge which point to a
- * statically allocated ack frame. This ack wr does not fall under the ring
- * accounting that the tx and rx wrs do. The QP attribute specifically makes
- * room for it beyond the ring size. Send completion notices its special
- * wr_id and avoids working with the ring in that case.
- */
-#ifndef KERNEL_HAS_ATOMIC64
-static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
- int ack_required)
-{
- unsigned long flags;
-
- spin_lock_irqsave(&ic->i_ack_lock, flags);
- ic->i_ack_next = seq;
- if (ack_required)
- set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- spin_unlock_irqrestore(&ic->i_ack_lock, flags);
-}
-
-static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
-{
- unsigned long flags;
- u64 seq;
-
- clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-
- spin_lock_irqsave(&ic->i_ack_lock, flags);
- seq = ic->i_ack_next;
- spin_unlock_irqrestore(&ic->i_ack_lock, flags);
-
- return seq;
-}
-#else
-static void rds_iw_set_ack(struct rds_iw_connection *ic, u64 seq,
- int ack_required)
-{
- atomic64_set(&ic->i_ack_next, seq);
- if (ack_required) {
- smp_mb__before_atomic();
- set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- }
-}
-
-static u64 rds_iw_get_ack(struct rds_iw_connection *ic)
-{
- clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- smp_mb__after_atomic();
-
- return atomic64_read(&ic->i_ack_next);
-}
-#endif
-
-
-static void rds_iw_send_ack(struct rds_iw_connection *ic, unsigned int adv_credits)
-{
- struct rds_header *hdr = ic->i_ack;
- struct ib_send_wr *failed_wr;
- u64 seq;
- int ret;
-
- seq = rds_iw_get_ack(ic);
-
- rdsdebug("send_ack: ic %p ack %llu\n", ic, (unsigned long long) seq);
- rds_message_populate_header(hdr, 0, 0, 0);
- hdr->h_ack = cpu_to_be64(seq);
- hdr->h_credit = adv_credits;
- rds_message_make_checksum(hdr);
- ic->i_ack_queued = jiffies;
-
- ret = ib_post_send(ic->i_cm_id->qp, &ic->i_ack_wr, &failed_wr);
- if (unlikely(ret)) {
- /* Failed to send. Release the WR, and
- * force another ACK.
- */
- clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
- set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-
- rds_iw_stats_inc(s_iw_ack_send_failure);
-
- rds_iw_conn_error(ic->conn, "sending ack failed\n");
- } else
- rds_iw_stats_inc(s_iw_ack_sent);
-}
-
-/*
- * There are 3 ways of getting acknowledgements to the peer:
- * 1. We call rds_iw_attempt_ack from the recv completion handler
- * to send an ACK-only frame.
- * However, there can be only one such frame in the send queue
- * at any time, so we may have to postpone it.
- * 2. When another (data) packet is transmitted while there's
- * an ACK in the queue, we piggyback the ACK sequence number
- * on the data packet.
- * 3. If the ACK WR is done sending, we get called from the
- * send queue completion handler, and check whether there's
- * another ACK pending (postponed because the WR was on the
- * queue). If so, we transmit it.
- *
- * We maintain 2 variables:
- * - i_ack_flags, which keeps track of whether the ACK WR
- * is currently in the send queue or not (IB_ACK_IN_FLIGHT)
- * - i_ack_next, which is the last sequence number we received
- *
- * Potentially, send queue and receive queue handlers can run concurrently.
- * It would be nice to not have to use a spinlock to synchronize things,
- * but the one problem that rules this out is that 64bit updates are
- * not atomic on all platforms. Things would be a lot simpler if
- * we had atomic64 or maybe cmpxchg64 everywhere.
- *
- * Reconnecting complicates this picture just slightly. When we
- * reconnect, we may be seeing duplicate packets. The peer
- * is retransmitting them, because it hasn't seen an ACK for
- * them. It is important that we ACK these.
- *
- * ACK mitigation adds a header flag "ACK_REQUIRED"; any packet with
- * this flag set *MUST* be acknowledged immediately.
- */
-
-/*
- * When we get here, we're called from the recv queue handler.
- * Check whether we ought to transmit an ACK.
- */
-void rds_iw_attempt_ack(struct rds_iw_connection *ic)
-{
- unsigned int adv_credits;
-
- if (!test_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
- return;
-
- if (test_and_set_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags)) {
- rds_iw_stats_inc(s_iw_ack_send_delayed);
- return;
- }
-
- /* Can we get a send credit? */
- if (!rds_iw_send_grab_credits(ic, 1, &adv_credits, 0, RDS_MAX_ADV_CREDIT)) {
- rds_iw_stats_inc(s_iw_tx_throttle);
- clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
- return;
- }
-
- clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
- rds_iw_send_ack(ic, adv_credits);
-}
-
-/*
- * We get here from the send completion handler, when the
- * adapter tells us the ACK frame was sent.
- */
-void rds_iw_ack_send_complete(struct rds_iw_connection *ic)
-{
- clear_bit(IB_ACK_IN_FLIGHT, &ic->i_ack_flags);
- rds_iw_attempt_ack(ic);
-}
-
-/*
- * This is called by the regular xmit code when it wants to piggyback
- * an ACK on an outgoing frame.
- */
-u64 rds_iw_piggyb_ack(struct rds_iw_connection *ic)
-{
- if (test_and_clear_bit(IB_ACK_REQUESTED, &ic->i_ack_flags))
- rds_iw_stats_inc(s_iw_ack_send_piggybacked);
- return rds_iw_get_ack(ic);
-}
-
-/*
- * It's kind of lame that we're copying from the posted receive pages into
- * long-lived bitmaps. We could have posted the bitmaps and rdma written into
- * them. But receiving new congestion bitmaps should be a *rare* event, so
- * hopefully we won't need to invest that complexity in making it more
- * efficient. By copying we can share a simpler core with TCP which has to
- * copy.
- */
-static void rds_iw_cong_recv(struct rds_connection *conn,
- struct rds_iw_incoming *iwinc)
-{
- struct rds_cong_map *map;
- unsigned int map_off;
- unsigned int map_page;
- struct rds_page_frag *frag;
- unsigned long frag_off;
- unsigned long to_copy;
- unsigned long copied;
- uint64_t uncongested = 0;
- void *addr;
-
- /* catch completely corrupt packets */
- if (be32_to_cpu(iwinc->ii_inc.i_hdr.h_len) != RDS_CONG_MAP_BYTES)
- return;
-
- map = conn->c_fcong;
- map_page = 0;
- map_off = 0;
-
- frag = list_entry(iwinc->ii_frags.next, struct rds_page_frag, f_item);
- frag_off = 0;
-
- copied = 0;
-
- while (copied < RDS_CONG_MAP_BYTES) {
- uint64_t *src, *dst;
- unsigned int k;
-
- to_copy = min(RDS_FRAG_SIZE - frag_off, PAGE_SIZE - map_off);
- BUG_ON(to_copy & 7); /* Must be 64bit aligned. */
-
- addr = kmap_atomic(frag->f_page);
-
- src = addr + frag_off;
- dst = (void *)map->m_page_addrs[map_page] + map_off;
- for (k = 0; k < to_copy; k += 8) {
- /* Record ports that became uncongested, ie
- * bits that changed from 0 to 1. */
- uncongested |= ~(*src) & *dst;
- *dst++ = *src++;
- }
- kunmap_atomic(addr);
-
- copied += to_copy;
-
- map_off += to_copy;
- if (map_off == PAGE_SIZE) {
- map_off = 0;
- map_page++;
- }
-
- frag_off += to_copy;
- if (frag_off == RDS_FRAG_SIZE) {
- frag = list_entry(frag->f_item.next,
- struct rds_page_frag, f_item);
- frag_off = 0;
- }
- }
-
- /* the congestion map is in little endian order */
- uncongested = le64_to_cpu(uncongested);
-
- rds_cong_map_updated(map, uncongested);
-}
-
-/*
- * Rings are posted with all the allocations they'll need to queue the
- * incoming message to the receiving socket so this can't fail.
- * All fragments start with a header, so we can make sure we're not receiving
- * garbage, and we can tell a small 8 byte fragment from an ACK frame.
- */
-struct rds_iw_ack_state {
- u64 ack_next;
- u64 ack_recv;
- unsigned int ack_required:1;
- unsigned int ack_next_valid:1;
- unsigned int ack_recv_valid:1;
-};
-
-static void rds_iw_process_recv(struct rds_connection *conn,
- struct rds_iw_recv_work *recv, u32 byte_len,
- struct rds_iw_ack_state *state)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_incoming *iwinc = ic->i_iwinc;
- struct rds_header *ihdr, *hdr;
-
- /* XXX shut down the connection if port 0,0 are seen? */
-
- rdsdebug("ic %p iwinc %p recv %p byte len %u\n", ic, iwinc, recv,
- byte_len);
-
- if (byte_len < sizeof(struct rds_header)) {
- rds_iw_conn_error(conn, "incoming message "
- "from %pI4 didn't include a "
- "header, disconnecting and "
- "reconnecting\n",
- &conn->c_faddr);
- return;
- }
- byte_len -= sizeof(struct rds_header);
-
- ihdr = &ic->i_recv_hdrs[recv - ic->i_recvs];
-
- /* Validate the checksum. */
- if (!rds_message_verify_checksum(ihdr)) {
- rds_iw_conn_error(conn, "incoming message "
- "from %pI4 has corrupted header - "
- "forcing a reconnect\n",
- &conn->c_faddr);
- rds_stats_inc(s_recv_drop_bad_checksum);
- return;
- }
-
- /* Process the ACK sequence which comes with every packet */
- state->ack_recv = be64_to_cpu(ihdr->h_ack);
- state->ack_recv_valid = 1;
-
- /* Process the credits update if there was one */
- if (ihdr->h_credit)
- rds_iw_send_add_credits(conn, ihdr->h_credit);
-
- if (ihdr->h_sport == 0 && ihdr->h_dport == 0 && byte_len == 0) {
- /* This is an ACK-only packet. The fact that it gets
- * special treatment here is that historically, ACKs
- * were rather special beasts.
- */
- rds_iw_stats_inc(s_iw_ack_received);
-
- /*
- * Usually the frags make their way on to incs and are then freed as
- * the inc is freed. We don't go that route, so we have to drop the
- * page ref ourselves. We can't just leave the page on the recv
- * because that confuses the dma mapping of pages and each recv's use
- * of a partial page. We can leave the frag, though, it will be
- * reused.
- *
- * FIXME: Fold this into the code path below.
- */
- rds_iw_frag_drop_page(recv->r_frag);
- return;
- }
-
- /*
- * If we don't already have an inc on the connection then this
- * fragment has a header and starts a message.. copy its header
- * into the inc and save the inc so we can hang upcoming fragments
- * off its list.
- */
- if (!iwinc) {
- iwinc = recv->r_iwinc;
- recv->r_iwinc = NULL;
- ic->i_iwinc = iwinc;
-
- hdr = &iwinc->ii_inc.i_hdr;
- memcpy(hdr, ihdr, sizeof(*hdr));
- ic->i_recv_data_rem = be32_to_cpu(hdr->h_len);
-
- rdsdebug("ic %p iwinc %p rem %u flag 0x%x\n", ic, iwinc,
- ic->i_recv_data_rem, hdr->h_flags);
- } else {
- hdr = &iwinc->ii_inc.i_hdr;
- /* We can't just use memcmp here; fragments of a
- * single message may carry different ACKs */
- if (hdr->h_sequence != ihdr->h_sequence ||
- hdr->h_len != ihdr->h_len ||
- hdr->h_sport != ihdr->h_sport ||
- hdr->h_dport != ihdr->h_dport) {
- rds_iw_conn_error(conn,
- "fragment header mismatch; forcing reconnect\n");
- return;
- }
- }
-
- list_add_tail(&recv->r_frag->f_item, &iwinc->ii_frags);
- recv->r_frag = NULL;
-
- if (ic->i_recv_data_rem > RDS_FRAG_SIZE)
- ic->i_recv_data_rem -= RDS_FRAG_SIZE;
- else {
- ic->i_recv_data_rem = 0;
- ic->i_iwinc = NULL;
-
- if (iwinc->ii_inc.i_hdr.h_flags == RDS_FLAG_CONG_BITMAP)
- rds_iw_cong_recv(conn, iwinc);
- else {
- rds_recv_incoming(conn, conn->c_faddr, conn->c_laddr,
- &iwinc->ii_inc, GFP_ATOMIC);
- state->ack_next = be64_to_cpu(hdr->h_sequence);
- state->ack_next_valid = 1;
- }
-
- /* Evaluate the ACK_REQUIRED flag *after* we received
- * the complete frame, and after bumping the next_rx
- * sequence. */
- if (hdr->h_flags & RDS_FLAG_ACK_REQUIRED) {
- rds_stats_inc(s_recv_ack_required);
- state->ack_required = 1;
- }
-
- rds_inc_put(&iwinc->ii_inc);
- }
-}
-
-/*
- * Plucking the oldest entry from the ring can be done concurrently with
- * the thread refilling the ring. Each ring operation is protected by
- * spinlocks and the transient state of refilling doesn't change the
- * recording of which entry is oldest.
- *
- * This relies on IB only calling one cq comp_handler for each cq so that
- * there will only be one caller of rds_recv_incoming() per RDS connection.
- */
-void rds_iw_recv_cq_comp_handler(struct ib_cq *cq, void *context)
-{
- struct rds_connection *conn = context;
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- rdsdebug("conn %p cq %p\n", conn, cq);
-
- rds_iw_stats_inc(s_iw_rx_cq_call);
-
- tasklet_schedule(&ic->i_recv_tasklet);
-}
-
-static inline void rds_poll_cq(struct rds_iw_connection *ic,
- struct rds_iw_ack_state *state)
-{
- struct rds_connection *conn = ic->conn;
- struct ib_wc wc;
- struct rds_iw_recv_work *recv;
-
- while (ib_poll_cq(ic->i_recv_cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
- be32_to_cpu(wc.ex.imm_data));
- rds_iw_stats_inc(s_iw_rx_cq_event);
-
- recv = &ic->i_recvs[rds_iw_ring_oldest(&ic->i_recv_ring)];
-
- rds_iw_recv_unmap_page(ic, recv);
-
- /*
- * Also process recvs in connecting state because it is possible
- * to get a recv completion _before_ the rdmacm ESTABLISHED
- * event is processed.
- */
- if (rds_conn_up(conn) || rds_conn_connecting(conn)) {
- /* We expect errors as the qp is drained during shutdown */
- if (wc.status == IB_WC_SUCCESS) {
- rds_iw_process_recv(conn, recv, wc.byte_len, state);
- } else {
- rds_iw_conn_error(conn, "recv completion on "
- "%pI4 had status %u, disconnecting and "
- "reconnecting\n", &conn->c_faddr,
- wc.status);
- }
- }
-
- rds_iw_ring_free(&ic->i_recv_ring, 1);
- }
-}
-
-void rds_iw_recv_tasklet_fn(unsigned long data)
-{
- struct rds_iw_connection *ic = (struct rds_iw_connection *) data;
- struct rds_connection *conn = ic->conn;
- struct rds_iw_ack_state state = { 0, };
-
- rds_poll_cq(ic, &state);
- ib_req_notify_cq(ic->i_recv_cq, IB_CQ_SOLICITED);
- rds_poll_cq(ic, &state);
-
- if (state.ack_next_valid)
- rds_iw_set_ack(ic, state.ack_next, state.ack_required);
- if (state.ack_recv_valid && state.ack_recv > ic->i_ack_recv) {
- rds_send_drop_acked(conn, state.ack_recv, NULL);
- ic->i_ack_recv = state.ack_recv;
- }
- if (rds_conn_up(conn))
- rds_iw_attempt_ack(ic);
-
- /* If we ever end up with a really empty receive ring, we're
- * in deep trouble, as the sender will definitely see RNR
- * timeouts. */
- if (rds_iw_ring_empty(&ic->i_recv_ring))
- rds_iw_stats_inc(s_iw_rx_ring_empty);
-
- /*
- * If the ring is running low, then schedule the thread to refill.
- */
- if (rds_iw_ring_low(&ic->i_recv_ring))
- queue_delayed_work(rds_wq, &conn->c_recv_w, 0);
-}
-
-int rds_iw_recv(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- int ret = 0;
-
- rdsdebug("conn %p\n", conn);
-
- /*
- * If we get a temporary posting failure in this context then
- * we're really low and we want the caller to back off for a bit.
- */
- mutex_lock(&ic->i_recv_mutex);
- if (rds_iw_recv_refill(conn, GFP_KERNEL, GFP_HIGHUSER, 0))
- ret = -ENOMEM;
- else
- rds_iw_stats_inc(s_iw_rx_refill_from_thread);
- mutex_unlock(&ic->i_recv_mutex);
-
- if (rds_conn_up(conn))
- rds_iw_attempt_ack(ic);
-
- return ret;
-}
-
-int rds_iw_recv_init(void)
-{
- struct sysinfo si;
- int ret = -ENOMEM;
-
- /* Default to 30% of all available RAM for recv memory */
- si_meminfo(&si);
- rds_iw_sysctl_max_recv_allocation = si.totalram / 3 * PAGE_SIZE / RDS_FRAG_SIZE;
-
- rds_iw_incoming_slab = kmem_cache_create("rds_iw_incoming",
- sizeof(struct rds_iw_incoming),
- 0, 0, NULL);
- if (!rds_iw_incoming_slab)
- goto out;
-
- rds_iw_frag_slab = kmem_cache_create("rds_iw_frag",
- sizeof(struct rds_page_frag),
- 0, 0, NULL);
- if (!rds_iw_frag_slab)
- kmem_cache_destroy(rds_iw_incoming_slab);
- else
- ret = 0;
-out:
- return ret;
-}
-
-void rds_iw_recv_exit(void)
-{
- kmem_cache_destroy(rds_iw_incoming_slab);
- kmem_cache_destroy(rds_iw_frag_slab);
-}
diff --git a/net/rds/iw_ring.c b/net/rds/iw_ring.c
deleted file mode 100644
index da8e3b63f663..000000000000
--- a/net/rds/iw_ring.c
+++ /dev/null
@@ -1,169 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-
-#include "rds.h"
-#include "iw.h"
-
-/*
- * Locking for IB rings.
- * We assume that allocation is always protected by a mutex
- * in the caller (this is a valid assumption for the current
- * implementation).
- *
- * Freeing always happens in an interrupt, and hence only
- * races with allocations, but not with other free()s.
- *
- * The interaction between allocation and freeing is that
- * the alloc code has to determine the number of free entries.
- * To this end, we maintain two counters; an allocation counter
- * and a free counter. Both are allowed to run freely, and wrap
- * around.
- * The number of used entries is always (alloc_ctr - free_ctr) % NR.
- *
- * The current implementation makes free_ctr atomic. When the
- * caller finds an allocation fails, it should set an "alloc fail"
- * bit and retry the allocation. The "alloc fail" bit essentially tells
- * the CQ completion handlers to wake it up after freeing some
- * more entries.
- */
-
-/*
- * This only happens on shutdown.
- */
-DECLARE_WAIT_QUEUE_HEAD(rds_iw_ring_empty_wait);
-
-void rds_iw_ring_init(struct rds_iw_work_ring *ring, u32 nr)
-{
- memset(ring, 0, sizeof(*ring));
- ring->w_nr = nr;
- rdsdebug("ring %p nr %u\n", ring, ring->w_nr);
-}
-
-static inline u32 __rds_iw_ring_used(struct rds_iw_work_ring *ring)
-{
- u32 diff;
-
- /* This assumes that atomic_t has at least as many bits as u32 */
- diff = ring->w_alloc_ctr - (u32) atomic_read(&ring->w_free_ctr);
- BUG_ON(diff > ring->w_nr);
-
- return diff;
-}
-
-void rds_iw_ring_resize(struct rds_iw_work_ring *ring, u32 nr)
-{
- /* We only ever get called from the connection setup code,
- * prior to creating the QP. */
- BUG_ON(__rds_iw_ring_used(ring));
- ring->w_nr = nr;
-}
-
-static int __rds_iw_ring_empty(struct rds_iw_work_ring *ring)
-{
- return __rds_iw_ring_used(ring) == 0;
-}
-
-u32 rds_iw_ring_alloc(struct rds_iw_work_ring *ring, u32 val, u32 *pos)
-{
- u32 ret = 0, avail;
-
- avail = ring->w_nr - __rds_iw_ring_used(ring);
-
- rdsdebug("ring %p val %u next %u free %u\n", ring, val,
- ring->w_alloc_ptr, avail);
-
- if (val && avail) {
- ret = min(val, avail);
- *pos = ring->w_alloc_ptr;
-
- ring->w_alloc_ptr = (ring->w_alloc_ptr + ret) % ring->w_nr;
- ring->w_alloc_ctr += ret;
- }
-
- return ret;
-}
-
-void rds_iw_ring_free(struct rds_iw_work_ring *ring, u32 val)
-{
- ring->w_free_ptr = (ring->w_free_ptr + val) % ring->w_nr;
- atomic_add(val, &ring->w_free_ctr);
-
- if (__rds_iw_ring_empty(ring) &&
- waitqueue_active(&rds_iw_ring_empty_wait))
- wake_up(&rds_iw_ring_empty_wait);
-}
-
-void rds_iw_ring_unalloc(struct rds_iw_work_ring *ring, u32 val)
-{
- ring->w_alloc_ptr = (ring->w_alloc_ptr - val) % ring->w_nr;
- ring->w_alloc_ctr -= val;
-}
-
-int rds_iw_ring_empty(struct rds_iw_work_ring *ring)
-{
- return __rds_iw_ring_empty(ring);
-}
-
-int rds_iw_ring_low(struct rds_iw_work_ring *ring)
-{
- return __rds_iw_ring_used(ring) <= (ring->w_nr >> 1);
-}
-
-
-/*
- * returns the oldest alloced ring entry. This will be the next one
- * freed. This can't be called if there are none allocated.
- */
-u32 rds_iw_ring_oldest(struct rds_iw_work_ring *ring)
-{
- return ring->w_free_ptr;
-}
-
-/*
- * returns the number of completed work requests.
- */
-
-u32 rds_iw_ring_completed(struct rds_iw_work_ring *ring, u32 wr_id, u32 oldest)
-{
- u32 ret;
-
- if (oldest <= (unsigned long long)wr_id)
- ret = (unsigned long long)wr_id - oldest + 1;
- else
- ret = ring->w_nr - oldest + (unsigned long long)wr_id + 1;
-
- rdsdebug("ring %p ret %u wr_id %u oldest %u\n", ring, ret,
- wr_id, oldest);
- return ret;
-}
diff --git a/net/rds/iw_send.c b/net/rds/iw_send.c
deleted file mode 100644
index 86152ec3b887..000000000000
--- a/net/rds/iw_send.c
+++ /dev/null
@@ -1,979 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/in.h>
-#include <linux/device.h>
-#include <linux/dmapool.h>
-#include <linux/ratelimit.h>
-
-#include "rds.h"
-#include "iw.h"
-
-static void rds_iw_send_rdma_complete(struct rds_message *rm,
- int wc_status)
-{
- int notify_status;
-
- switch (wc_status) {
- case IB_WC_WR_FLUSH_ERR:
- return;
-
- case IB_WC_SUCCESS:
- notify_status = RDS_RDMA_SUCCESS;
- break;
-
- case IB_WC_REM_ACCESS_ERR:
- notify_status = RDS_RDMA_REMOTE_ERROR;
- break;
-
- default:
- notify_status = RDS_RDMA_OTHER_ERROR;
- break;
- }
- rds_rdma_send_complete(rm, notify_status);
-}
-
-static void rds_iw_send_unmap_rdma(struct rds_iw_connection *ic,
- struct rm_rdma_op *op)
-{
- if (op->op_mapped) {
- ib_dma_unmap_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents,
- op->op_write ? DMA_TO_DEVICE : DMA_FROM_DEVICE);
- op->op_mapped = 0;
- }
-}
-
-static void rds_iw_send_unmap_rm(struct rds_iw_connection *ic,
- struct rds_iw_send_work *send,
- int wc_status)
-{
- struct rds_message *rm = send->s_rm;
-
- rdsdebug("ic %p send %p rm %p\n", ic, send, rm);
-
- ib_dma_unmap_sg(ic->i_cm_id->device,
- rm->data.op_sg, rm->data.op_nents,
- DMA_TO_DEVICE);
-
- if (rm->rdma.op_active) {
- rds_iw_send_unmap_rdma(ic, &rm->rdma);
-
- /* If the user asked for a completion notification on this
- * message, we can implement three different semantics:
- * 1. Notify when we received the ACK on the RDS message
- * that was queued with the RDMA. This provides reliable
- * notification of RDMA status at the expense of a one-way
- * packet delay.
- * 2. Notify when the IB stack gives us the completion event for
- * the RDMA operation.
- * 3. Notify when the IB stack gives us the completion event for
- * the accompanying RDS messages.
- * Here, we implement approach #3. To implement approach #2,
- * call rds_rdma_send_complete from the cq_handler. To implement #1,
- * don't call rds_rdma_send_complete at all, and fall back to the notify
- * handling in the ACK processing code.
- *
- * Note: There's no need to explicitly sync any RDMA buffers using
- * ib_dma_sync_sg_for_cpu - the completion for the RDMA
- * operation itself unmapped the RDMA buffers, which takes care
- * of synching.
- */
- rds_iw_send_rdma_complete(rm, wc_status);
-
- if (rm->rdma.op_write)
- rds_stats_add(s_send_rdma_bytes, rm->rdma.op_bytes);
- else
- rds_stats_add(s_recv_rdma_bytes, rm->rdma.op_bytes);
- }
-
- /* If anyone waited for this message to get flushed out, wake
- * them up now */
- rds_message_unmapped(rm);
-
- rds_message_put(rm);
- send->s_rm = NULL;
-}
-
-void rds_iw_send_init_ring(struct rds_iw_connection *ic)
-{
- struct rds_iw_send_work *send;
- u32 i;
-
- for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
- struct ib_sge *sge;
-
- send->s_rm = NULL;
- send->s_op = NULL;
- send->s_mapping = NULL;
-
- send->s_wr.next = NULL;
- send->s_wr.wr_id = i;
- send->s_wr.sg_list = send->s_sge;
- send->s_wr.num_sge = 1;
- send->s_wr.opcode = IB_WR_SEND;
- send->s_wr.send_flags = 0;
- send->s_wr.ex.imm_data = 0;
-
- sge = rds_iw_data_sge(ic, send->s_sge);
- sge->lkey = 0;
-
- sge = rds_iw_header_sge(ic, send->s_sge);
- sge->addr = ic->i_send_hdrs_dma + (i * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = 0;
-
- send->s_mr = ib_alloc_mr(ic->i_pd, IB_MR_TYPE_MEM_REG,
- fastreg_message_size);
- if (IS_ERR(send->s_mr)) {
- printk(KERN_WARNING "RDS/IW: ib_alloc_mr failed\n");
- break;
- }
-
- send->s_page_list = ib_alloc_fast_reg_page_list(
- ic->i_cm_id->device, fastreg_message_size);
- if (IS_ERR(send->s_page_list)) {
- printk(KERN_WARNING "RDS/IW: ib_alloc_fast_reg_page_list failed\n");
- break;
- }
- }
-}
-
-void rds_iw_send_clear_ring(struct rds_iw_connection *ic)
-{
- struct rds_iw_send_work *send;
- u32 i;
-
- for (i = 0, send = ic->i_sends; i < ic->i_send_ring.w_nr; i++, send++) {
- BUG_ON(!send->s_mr);
- ib_dereg_mr(send->s_mr);
- BUG_ON(!send->s_page_list);
- ib_free_fast_reg_page_list(send->s_page_list);
- if (send->s_wr.opcode == 0xdead)
- continue;
- if (send->s_rm)
- rds_iw_send_unmap_rm(ic, send, IB_WC_WR_FLUSH_ERR);
- if (send->s_op)
- rds_iw_send_unmap_rdma(ic, send->s_op);
- }
-}
-
-/*
- * The _oldest/_free ring operations here race cleanly with the alloc/unalloc
- * operations performed in the send path. As the sender allocs and potentially
- * unallocs the next free entry in the ring it doesn't alter which is
- * the next to be freed, which is what this is concerned with.
- */
-void rds_iw_send_cq_comp_handler(struct ib_cq *cq, void *context)
-{
- struct rds_connection *conn = context;
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct ib_wc wc;
- struct rds_iw_send_work *send;
- u32 completed;
- u32 oldest;
- u32 i;
- int ret;
-
- rdsdebug("cq %p conn %p\n", cq, conn);
- rds_iw_stats_inc(s_iw_tx_cq_call);
- ret = ib_req_notify_cq(cq, IB_CQ_NEXT_COMP);
- if (ret)
- rdsdebug("ib_req_notify_cq send failed: %d\n", ret);
-
- while (ib_poll_cq(cq, 1, &wc) > 0) {
- rdsdebug("wc wr_id 0x%llx status %u byte_len %u imm_data %u\n",
- (unsigned long long)wc.wr_id, wc.status, wc.byte_len,
- be32_to_cpu(wc.ex.imm_data));
- rds_iw_stats_inc(s_iw_tx_cq_event);
-
- if (wc.status != IB_WC_SUCCESS) {
- printk(KERN_ERR "WC Error: status = %d opcode = %d\n", wc.status, wc.opcode);
- break;
- }
-
- if (wc.opcode == IB_WC_LOCAL_INV && wc.wr_id == RDS_IW_LOCAL_INV_WR_ID) {
- ic->i_fastreg_posted = 0;
- continue;
- }
-
- if (wc.opcode == IB_WC_FAST_REG_MR && wc.wr_id == RDS_IW_FAST_REG_WR_ID) {
- ic->i_fastreg_posted = 1;
- continue;
- }
-
- if (wc.wr_id == RDS_IW_ACK_WR_ID) {
- if (time_after(jiffies, ic->i_ack_queued + HZ/2))
- rds_iw_stats_inc(s_iw_tx_stalled);
- rds_iw_ack_send_complete(ic);
- continue;
- }
-
- oldest = rds_iw_ring_oldest(&ic->i_send_ring);
-
- completed = rds_iw_ring_completed(&ic->i_send_ring, wc.wr_id, oldest);
-
- for (i = 0; i < completed; i++) {
- send = &ic->i_sends[oldest];
-
- /* In the error case, wc.opcode sometimes contains garbage */
- switch (send->s_wr.opcode) {
- case IB_WR_SEND:
- if (send->s_rm)
- rds_iw_send_unmap_rm(ic, send, wc.status);
- break;
- case IB_WR_FAST_REG_MR:
- case IB_WR_RDMA_WRITE:
- case IB_WR_RDMA_READ:
- case IB_WR_RDMA_READ_WITH_INV:
- /* Nothing to be done - the SG list will be unmapped
- * when the SEND completes. */
- break;
- default:
- printk_ratelimited(KERN_NOTICE
- "RDS/IW: %s: unexpected opcode 0x%x in WR!\n",
- __func__, send->s_wr.opcode);
- break;
- }
-
- send->s_wr.opcode = 0xdead;
- send->s_wr.num_sge = 1;
- if (time_after(jiffies, send->s_queued + HZ/2))
- rds_iw_stats_inc(s_iw_tx_stalled);
-
- /* If a RDMA operation produced an error, signal this right
- * away. If we don't, the subsequent SEND that goes with this
- * RDMA will be canceled with ERR_WFLUSH, and the application
- * never learn that the RDMA failed. */
- if (unlikely(wc.status == IB_WC_REM_ACCESS_ERR && send->s_op)) {
- struct rds_message *rm;
-
- rm = rds_send_get_message(conn, send->s_op);
- if (rm)
- rds_iw_send_rdma_complete(rm, wc.status);
- }
-
- oldest = (oldest + 1) % ic->i_send_ring.w_nr;
- }
-
- rds_iw_ring_free(&ic->i_send_ring, completed);
-
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags) ||
- test_bit(0, &conn->c_map_queued))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
- /* We expect errors as the qp is drained during shutdown */
- if (wc.status != IB_WC_SUCCESS && rds_conn_up(conn)) {
- rds_iw_conn_error(conn,
- "send completion on %pI4 "
- "had status %u, disconnecting and reconnecting\n",
- &conn->c_faddr, wc.status);
- }
- }
-}
-
-/*
- * This is the main function for allocating credits when sending
- * messages.
- *
- * Conceptually, we have two counters:
- * - send credits: this tells us how many WRs we're allowed
- * to submit without overruning the receiver's queue. For
- * each SEND WR we post, we decrement this by one.
- *
- * - posted credits: this tells us how many WRs we recently
- * posted to the receive queue. This value is transferred
- * to the peer as a "credit update" in a RDS header field.
- * Every time we transmit credits to the peer, we subtract
- * the amount of transferred credits from this counter.
- *
- * It is essential that we avoid situations where both sides have
- * exhausted their send credits, and are unable to send new credits
- * to the peer. We achieve this by requiring that we send at least
- * one credit update to the peer before exhausting our credits.
- * When new credits arrive, we subtract one credit that is withheld
- * until we've posted new buffers and are ready to transmit these
- * credits (see rds_iw_send_add_credits below).
- *
- * The RDS send code is essentially single-threaded; rds_send_xmit
- * grabs c_send_lock to ensure exclusive access to the send ring.
- * However, the ACK sending code is independent and can race with
- * message SENDs.
- *
- * In the send path, we need to update the counters for send credits
- * and the counter of posted buffers atomically - when we use the
- * last available credit, we cannot allow another thread to race us
- * and grab the posted credits counter. Hence, we have to use a
- * spinlock to protect the credit counter, or use atomics.
- *
- * Spinlocks shared between the send and the receive path are bad,
- * because they create unnecessary delays. An early implementation
- * using a spinlock showed a 5% degradation in throughput at some
- * loads.
- *
- * This implementation avoids spinlocks completely, putting both
- * counters into a single atomic, and updating that atomic using
- * atomic_add (in the receive path, when receiving fresh credits),
- * and using atomic_cmpxchg when updating the two counters.
- */
-int rds_iw_send_grab_credits(struct rds_iw_connection *ic,
- u32 wanted, u32 *adv_credits, int need_posted, int max_posted)
-{
- unsigned int avail, posted, got = 0, advertise;
- long oldval, newval;
-
- *adv_credits = 0;
- if (!ic->i_flowctl)
- return wanted;
-
-try_again:
- advertise = 0;
- oldval = newval = atomic_read(&ic->i_credits);
- posted = IB_GET_POST_CREDITS(oldval);
- avail = IB_GET_SEND_CREDITS(oldval);
-
- rdsdebug("wanted=%u credits=%u posted=%u\n",
- wanted, avail, posted);
-
- /* The last credit must be used to send a credit update. */
- if (avail && !posted)
- avail--;
-
- if (avail < wanted) {
- struct rds_connection *conn = ic->i_cm_id->context;
-
- /* Oops, there aren't that many credits left! */
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
- got = avail;
- } else {
- /* Sometimes you get what you want, lalala. */
- got = wanted;
- }
- newval -= IB_SET_SEND_CREDITS(got);
-
- /*
- * If need_posted is non-zero, then the caller wants
- * the posted regardless of whether any send credits are
- * available.
- */
- if (posted && (got || need_posted)) {
- advertise = min_t(unsigned int, posted, max_posted);
- newval -= IB_SET_POST_CREDITS(advertise);
- }
-
- /* Finally bill everything */
- if (atomic_cmpxchg(&ic->i_credits, oldval, newval) != oldval)
- goto try_again;
-
- *adv_credits = advertise;
- return got;
-}
-
-void rds_iw_send_add_credits(struct rds_connection *conn, unsigned int credits)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- if (credits == 0)
- return;
-
- rdsdebug("credits=%u current=%u%s\n",
- credits,
- IB_GET_SEND_CREDITS(atomic_read(&ic->i_credits)),
- test_bit(RDS_LL_SEND_FULL, &conn->c_flags) ? ", ll_send_full" : "");
-
- atomic_add(IB_SET_SEND_CREDITS(credits), &ic->i_credits);
- if (test_and_clear_bit(RDS_LL_SEND_FULL, &conn->c_flags))
- queue_delayed_work(rds_wq, &conn->c_send_w, 0);
-
- WARN_ON(IB_GET_SEND_CREDITS(credits) >= 16384);
-
- rds_iw_stats_inc(s_iw_rx_credit_updates);
-}
-
-void rds_iw_advertise_credits(struct rds_connection *conn, unsigned int posted)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- if (posted == 0)
- return;
-
- atomic_add(IB_SET_POST_CREDITS(posted), &ic->i_credits);
-
- /* Decide whether to send an update to the peer now.
- * If we would send a credit update for every single buffer we
- * post, we would end up with an ACK storm (ACK arrives,
- * consumes buffer, we refill the ring, send ACK to remote
- * advertising the newly posted buffer... ad inf)
- *
- * Performance pretty much depends on how often we send
- * credit updates - too frequent updates mean lots of ACKs.
- * Too infrequent updates, and the peer will run out of
- * credits and has to throttle.
- * For the time being, 16 seems to be a good compromise.
- */
- if (IB_GET_POST_CREDITS(atomic_read(&ic->i_credits)) >= 16)
- set_bit(IB_ACK_REQUESTED, &ic->i_ack_flags);
-}
-
-static inline void
-rds_iw_xmit_populate_wr(struct rds_iw_connection *ic,
- struct rds_iw_send_work *send, unsigned int pos,
- unsigned long buffer, unsigned int length,
- int send_flags)
-{
- struct ib_sge *sge;
-
- WARN_ON(pos != send - ic->i_sends);
-
- send->s_wr.send_flags = send_flags;
- send->s_wr.opcode = IB_WR_SEND;
- send->s_wr.num_sge = 2;
- send->s_wr.next = NULL;
- send->s_queued = jiffies;
- send->s_op = NULL;
-
- if (length != 0) {
- sge = rds_iw_data_sge(ic, send->s_sge);
- sge->addr = buffer;
- sge->length = length;
- sge->lkey = rds_iw_local_dma_lkey(ic);
-
- sge = rds_iw_header_sge(ic, send->s_sge);
- } else {
- /* We're sending a packet with no payload. There is only
- * one SGE */
- send->s_wr.num_sge = 1;
- sge = &send->s_sge[0];
- }
-
- sge->addr = ic->i_send_hdrs_dma + (pos * sizeof(struct rds_header));
- sge->length = sizeof(struct rds_header);
- sge->lkey = rds_iw_local_dma_lkey(ic);
-}
-
-/*
- * This can be called multiple times for a given message. The first time
- * we see a message we map its scatterlist into the IB device so that
- * we can provide that mapped address to the IB scatter gather entries
- * in the IB work requests. We translate the scatterlist into a series
- * of work requests that fragment the message. These work requests complete
- * in order so we pass ownership of the message to the completion handler
- * once we send the final fragment.
- *
- * The RDS core uses the c_send_lock to only enter this function once
- * per connection. This makes sure that the tx ring alloc/unalloc pairs
- * don't get out of sync and confuse the ring.
- */
-int rds_iw_xmit(struct rds_connection *conn, struct rds_message *rm,
- unsigned int hdr_off, unsigned int sg, unsigned int off)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct ib_device *dev = ic->i_cm_id->device;
- struct rds_iw_send_work *send = NULL;
- struct rds_iw_send_work *first;
- struct rds_iw_send_work *prev;
- struct ib_send_wr *failed_wr;
- struct scatterlist *scat;
- u32 pos;
- u32 i;
- u32 work_alloc;
- u32 credit_alloc;
- u32 posted;
- u32 adv_credits = 0;
- int send_flags = 0;
- int sent;
- int ret;
- int flow_controlled = 0;
-
- BUG_ON(off % RDS_FRAG_SIZE);
- BUG_ON(hdr_off != 0 && hdr_off != sizeof(struct rds_header));
-
- /* Fastreg support */
- if (rds_rdma_cookie_key(rm->m_rdma_cookie) && !ic->i_fastreg_posted) {
- ret = -EAGAIN;
- goto out;
- }
-
- /* FIXME we may overallocate here */
- if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0)
- i = 1;
- else
- i = ceil(be32_to_cpu(rm->m_inc.i_hdr.h_len), RDS_FRAG_SIZE);
-
- work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
- if (work_alloc == 0) {
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
- rds_iw_stats_inc(s_iw_tx_ring_full);
- ret = -ENOMEM;
- goto out;
- }
-
- credit_alloc = work_alloc;
- if (ic->i_flowctl) {
- credit_alloc = rds_iw_send_grab_credits(ic, work_alloc, &posted, 0, RDS_MAX_ADV_CREDIT);
- adv_credits += posted;
- if (credit_alloc < work_alloc) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - credit_alloc);
- work_alloc = credit_alloc;
- flow_controlled++;
- }
- if (work_alloc == 0) {
- set_bit(RDS_LL_SEND_FULL, &conn->c_flags);
- rds_iw_stats_inc(s_iw_tx_throttle);
- ret = -ENOMEM;
- goto out;
- }
- }
-
- /* map the message the first time we see it */
- if (!ic->i_rm) {
- /*
- printk(KERN_NOTICE "rds_iw_xmit prep msg dport=%u flags=0x%x len=%d\n",
- be16_to_cpu(rm->m_inc.i_hdr.h_dport),
- rm->m_inc.i_hdr.h_flags,
- be32_to_cpu(rm->m_inc.i_hdr.h_len));
- */
- if (rm->data.op_nents) {
- rm->data.op_count = ib_dma_map_sg(dev,
- rm->data.op_sg,
- rm->data.op_nents,
- DMA_TO_DEVICE);
- rdsdebug("ic %p mapping rm %p: %d\n", ic, rm, rm->data.op_count);
- if (rm->data.op_count == 0) {
- rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- ret = -ENOMEM; /* XXX ? */
- goto out;
- }
- } else {
- rm->data.op_count = 0;
- }
-
- ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
- ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
- rds_message_addref(rm);
- rm->data.op_dmasg = 0;
- rm->data.op_dmaoff = 0;
- ic->i_rm = rm;
-
- /* Finalize the header */
- if (test_bit(RDS_MSG_ACK_REQUIRED, &rm->m_flags))
- rm->m_inc.i_hdr.h_flags |= RDS_FLAG_ACK_REQUIRED;
- if (test_bit(RDS_MSG_RETRANSMITTED, &rm->m_flags))
- rm->m_inc.i_hdr.h_flags |= RDS_FLAG_RETRANSMITTED;
-
- /* If it has a RDMA op, tell the peer we did it. This is
- * used by the peer to release use-once RDMA MRs. */
- if (rm->rdma.op_active) {
- struct rds_ext_header_rdma ext_hdr;
-
- ext_hdr.h_rdma_rkey = cpu_to_be32(rm->rdma.op_rkey);
- rds_message_add_extension(&rm->m_inc.i_hdr,
- RDS_EXTHDR_RDMA, &ext_hdr, sizeof(ext_hdr));
- }
- if (rm->m_rdma_cookie) {
- rds_message_add_rdma_dest_extension(&rm->m_inc.i_hdr,
- rds_rdma_cookie_key(rm->m_rdma_cookie),
- rds_rdma_cookie_offset(rm->m_rdma_cookie));
- }
-
- /* Note - rds_iw_piggyb_ack clears the ACK_REQUIRED bit, so
- * we should not do this unless we have a chance of at least
- * sticking the header into the send ring. Which is why we
- * should call rds_iw_ring_alloc first. */
- rm->m_inc.i_hdr.h_ack = cpu_to_be64(rds_iw_piggyb_ack(ic));
- rds_message_make_checksum(&rm->m_inc.i_hdr);
-
- /*
- * Update adv_credits since we reset the ACK_REQUIRED bit.
- */
- rds_iw_send_grab_credits(ic, 0, &posted, 1, RDS_MAX_ADV_CREDIT - adv_credits);
- adv_credits += posted;
- BUG_ON(adv_credits > 255);
- }
-
- send = &ic->i_sends[pos];
- first = send;
- prev = NULL;
- scat = &rm->data.op_sg[rm->data.op_dmasg];
- sent = 0;
- i = 0;
-
- /* Sometimes you want to put a fence between an RDMA
- * READ and the following SEND.
- * We could either do this all the time
- * or when requested by the user. Right now, we let
- * the application choose.
- */
- if (rm->rdma.op_active && rm->rdma.op_fence)
- send_flags = IB_SEND_FENCE;
-
- /*
- * We could be copying the header into the unused tail of the page.
- * That would need to be changed in the future when those pages might
- * be mapped userspace pages or page cache pages. So instead we always
- * use a second sge and our long-lived ring of mapped headers. We send
- * the header after the data so that the data payload can be aligned on
- * the receiver.
- */
-
- /* handle a 0-len message */
- if (be32_to_cpu(rm->m_inc.i_hdr.h_len) == 0) {
- rds_iw_xmit_populate_wr(ic, send, pos, 0, 0, send_flags);
- goto add_header;
- }
-
- /* if there's data reference it with a chain of work reqs */
- for (; i < work_alloc && scat != &rm->data.op_sg[rm->data.op_count]; i++) {
- unsigned int len;
-
- send = &ic->i_sends[pos];
-
- len = min(RDS_FRAG_SIZE,
- ib_sg_dma_len(dev, scat) - rm->data.op_dmaoff);
- rds_iw_xmit_populate_wr(ic, send, pos,
- ib_sg_dma_address(dev, scat) + rm->data.op_dmaoff, len,
- send_flags);
-
- /*
- * We want to delay signaling completions just enough to get
- * the batching benefits but not so much that we create dead time
- * on the wire.
- */
- if (ic->i_unsignaled_wrs-- == 0) {
- ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
- send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- }
-
- ic->i_unsignaled_bytes -= len;
- if (ic->i_unsignaled_bytes <= 0) {
- ic->i_unsignaled_bytes = rds_iw_sysctl_max_unsig_bytes;
- send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- }
-
- /*
- * Always signal the last one if we're stopping due to flow control.
- */
- if (flow_controlled && i == (work_alloc-1))
- send->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
-
- rdsdebug("send %p wr %p num_sge %u next %p\n", send,
- &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
-
- sent += len;
- rm->data.op_dmaoff += len;
- if (rm->data.op_dmaoff == ib_sg_dma_len(dev, scat)) {
- scat++;
- rm->data.op_dmaoff = 0;
- rm->data.op_dmasg++;
- }
-
-add_header:
- /* Tack on the header after the data. The header SGE should already
- * have been set up to point to the right header buffer. */
- memcpy(&ic->i_send_hdrs[pos], &rm->m_inc.i_hdr, sizeof(struct rds_header));
-
- if (0) {
- struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
- printk(KERN_NOTICE "send WR dport=%u flags=0x%x len=%d\n",
- be16_to_cpu(hdr->h_dport),
- hdr->h_flags,
- be32_to_cpu(hdr->h_len));
- }
- if (adv_credits) {
- struct rds_header *hdr = &ic->i_send_hdrs[pos];
-
- /* add credit and redo the header checksum */
- hdr->h_credit = adv_credits;
- rds_message_make_checksum(hdr);
- adv_credits = 0;
- rds_iw_stats_inc(s_iw_tx_credit_updates);
- }
-
- if (prev)
- prev->s_wr.next = &send->s_wr;
- prev = send;
-
- pos = (pos + 1) % ic->i_send_ring.w_nr;
- }
-
- /* Account the RDS header in the number of bytes we sent, but just once.
- * The caller has no concept of fragmentation. */
- if (hdr_off == 0)
- sent += sizeof(struct rds_header);
-
- /* if we finished the message then send completion owns it */
- if (scat == &rm->data.op_sg[rm->data.op_count]) {
- prev->s_rm = ic->i_rm;
- prev->s_wr.send_flags |= IB_SEND_SIGNALED | IB_SEND_SOLICITED;
- ic->i_rm = NULL;
- }
-
- if (i < work_alloc) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
- work_alloc = i;
- }
- if (ic->i_flowctl && i < credit_alloc)
- rds_iw_send_add_credits(conn, credit_alloc - i);
-
- /* XXX need to worry about failed_wr and partial sends. */
- failed_wr = &first->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
- rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
- first, &first->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &first->s_wr);
- if (ret) {
- printk(KERN_WARNING "RDS/IW: ib_post_send to %pI4 "
- "returned %d\n", &conn->c_faddr, ret);
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- if (prev->s_rm) {
- ic->i_rm = prev->s_rm;
- prev->s_rm = NULL;
- }
- goto out;
- }
-
- ret = sent;
-out:
- BUG_ON(adv_credits);
- return ret;
-}
-
-static void rds_iw_build_send_fastreg(struct rds_iw_device *rds_iwdev, struct rds_iw_connection *ic, struct rds_iw_send_work *send, int nent, int len, u64 sg_addr)
-{
- BUG_ON(nent > send->s_page_list->max_page_list_len);
- /*
- * Perform a WR for the fast_reg_mr. Each individual page
- * in the sg list is added to the fast reg page list and placed
- * inside the fast_reg_mr WR.
- */
- send->s_wr.opcode = IB_WR_FAST_REG_MR;
- send->s_wr.wr.fast_reg.length = len;
- send->s_wr.wr.fast_reg.rkey = send->s_mr->rkey;
- send->s_wr.wr.fast_reg.page_list = send->s_page_list;
- send->s_wr.wr.fast_reg.page_list_len = nent;
- send->s_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- send->s_wr.wr.fast_reg.access_flags = IB_ACCESS_REMOTE_WRITE;
- send->s_wr.wr.fast_reg.iova_start = sg_addr;
-
- ib_update_fast_reg_key(send->s_mr, send->s_remap_count++);
-}
-
-int rds_iw_xmit_rdma(struct rds_connection *conn, struct rm_rdma_op *op)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
- struct rds_iw_send_work *send = NULL;
- struct rds_iw_send_work *first;
- struct rds_iw_send_work *prev;
- struct ib_send_wr *failed_wr;
- struct rds_iw_device *rds_iwdev;
- struct scatterlist *scat;
- unsigned long len;
- u64 remote_addr = op->op_remote_addr;
- u32 pos, fr_pos;
- u32 work_alloc;
- u32 i;
- u32 j;
- int sent;
- int ret;
- int num_sge;
-
- rds_iwdev = ib_get_client_data(ic->i_cm_id->device, &rds_iw_client);
-
- /* map the message the first time we see it */
- if (!op->op_mapped) {
- op->op_count = ib_dma_map_sg(ic->i_cm_id->device,
- op->op_sg, op->op_nents, (op->op_write) ?
- DMA_TO_DEVICE : DMA_FROM_DEVICE);
- rdsdebug("ic %p mapping op %p: %d\n", ic, op, op->op_count);
- if (op->op_count == 0) {
- rds_iw_stats_inc(s_iw_tx_sg_mapping_failure);
- ret = -ENOMEM; /* XXX ? */
- goto out;
- }
-
- op->op_mapped = 1;
- }
-
- if (!op->op_write) {
- /* Alloc space on the send queue for the fastreg */
- work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, 1, &fr_pos);
- if (work_alloc != 1) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_iw_stats_inc(s_iw_tx_ring_full);
- ret = -ENOMEM;
- goto out;
- }
- }
-
- /*
- * Instead of knowing how to return a partial rdma read/write we insist that there
- * be enough work requests to send the entire message.
- */
- i = ceil(op->op_count, rds_iwdev->max_sge);
-
- work_alloc = rds_iw_ring_alloc(&ic->i_send_ring, i, &pos);
- if (work_alloc != i) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- rds_iw_stats_inc(s_iw_tx_ring_full);
- ret = -ENOMEM;
- goto out;
- }
-
- send = &ic->i_sends[pos];
- if (!op->op_write) {
- first = prev = &ic->i_sends[fr_pos];
- } else {
- first = send;
- prev = NULL;
- }
- scat = &op->op_sg[0];
- sent = 0;
- num_sge = op->op_count;
-
- for (i = 0; i < work_alloc && scat != &op->op_sg[op->op_count]; i++) {
- send->s_wr.send_flags = 0;
- send->s_queued = jiffies;
-
- /*
- * We want to delay signaling completions just enough to get
- * the batching benefits but not so much that we create dead time on the wire.
- */
- if (ic->i_unsignaled_wrs-- == 0) {
- ic->i_unsignaled_wrs = rds_iw_sysctl_max_unsig_wrs;
- send->s_wr.send_flags = IB_SEND_SIGNALED;
- }
-
- /* To avoid the need to have the plumbing to invalidate the fastreg_mr used
- * for local access after RDS is finished with it, using
- * IB_WR_RDMA_READ_WITH_INV will invalidate it after the read has completed.
- */
- if (op->op_write)
- send->s_wr.opcode = IB_WR_RDMA_WRITE;
- else
- send->s_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
-
- send->s_wr.wr.rdma.remote_addr = remote_addr;
- send->s_wr.wr.rdma.rkey = op->op_rkey;
- send->s_op = op;
-
- if (num_sge > rds_iwdev->max_sge) {
- send->s_wr.num_sge = rds_iwdev->max_sge;
- num_sge -= rds_iwdev->max_sge;
- } else
- send->s_wr.num_sge = num_sge;
-
- send->s_wr.next = NULL;
-
- if (prev)
- prev->s_wr.next = &send->s_wr;
-
- for (j = 0; j < send->s_wr.num_sge && scat != &op->op_sg[op->op_count]; j++) {
- len = ib_sg_dma_len(ic->i_cm_id->device, scat);
-
- if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV)
- send->s_page_list->page_list[j] = ib_sg_dma_address(ic->i_cm_id->device, scat);
- else {
- send->s_sge[j].addr = ib_sg_dma_address(ic->i_cm_id->device, scat);
- send->s_sge[j].length = len;
- send->s_sge[j].lkey = rds_iw_local_dma_lkey(ic);
- }
-
- sent += len;
- rdsdebug("ic %p sent %d remote_addr %llu\n", ic, sent, remote_addr);
- remote_addr += len;
-
- scat++;
- }
-
- if (send->s_wr.opcode == IB_WR_RDMA_READ_WITH_INV) {
- send->s_wr.num_sge = 1;
- send->s_sge[0].addr = conn->c_xmit_rm->m_rs->rs_user_addr;
- send->s_sge[0].length = conn->c_xmit_rm->m_rs->rs_user_bytes;
- send->s_sge[0].lkey = ic->i_sends[fr_pos].s_mr->lkey;
- }
-
- rdsdebug("send %p wr %p num_sge %u next %p\n", send,
- &send->s_wr, send->s_wr.num_sge, send->s_wr.next);
-
- prev = send;
- if (++send == &ic->i_sends[ic->i_send_ring.w_nr])
- send = ic->i_sends;
- }
-
- /* if we finished the message then send completion owns it */
- if (scat == &op->op_sg[op->op_count])
- first->s_wr.send_flags = IB_SEND_SIGNALED;
-
- if (i < work_alloc) {
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc - i);
- work_alloc = i;
- }
-
- /* On iWARP, local memory access by a remote system (ie, RDMA Read) is not
- * recommended. Putting the lkey on the wire is a security hole, as it can
- * allow for memory access to all of memory on the remote system. Some
- * adapters do not allow using the lkey for this at all. To bypass this use a
- * fastreg_mr (or possibly a dma_mr)
- */
- if (!op->op_write) {
- rds_iw_build_send_fastreg(rds_iwdev, ic, &ic->i_sends[fr_pos],
- op->op_count, sent, conn->c_xmit_rm->m_rs->rs_user_addr);
- work_alloc++;
- }
-
- failed_wr = &first->s_wr;
- ret = ib_post_send(ic->i_cm_id->qp, &first->s_wr, &failed_wr);
- rdsdebug("ic %p first %p (wr %p) ret %d wr %p\n", ic,
- first, &first->s_wr, ret, failed_wr);
- BUG_ON(failed_wr != &first->s_wr);
- if (ret) {
- printk(KERN_WARNING "RDS/IW: rdma ib_post_send to %pI4 "
- "returned %d\n", &conn->c_faddr, ret);
- rds_iw_ring_unalloc(&ic->i_send_ring, work_alloc);
- goto out;
- }
-
-out:
- return ret;
-}
-
-void rds_iw_xmit_complete(struct rds_connection *conn)
-{
- struct rds_iw_connection *ic = conn->c_transport_data;
-
- /* We may have a pending ACK or window update we were unable
- * to send previously (due to flow control). Try again. */
- rds_iw_attempt_ack(ic);
-}
diff --git a/net/rds/iw_stats.c b/net/rds/iw_stats.c
deleted file mode 100644
index 5fe67f6a1d80..000000000000
--- a/net/rds/iw_stats.c
+++ /dev/null
@@ -1,95 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/percpu.h>
-#include <linux/seq_file.h>
-#include <linux/proc_fs.h>
-
-#include "rds.h"
-#include "iw.h"
-
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_iw_statistics, rds_iw_stats);
-
-static const char *const rds_iw_stat_names[] = {
- "iw_connect_raced",
- "iw_listen_closed_stale",
- "iw_tx_cq_call",
- "iw_tx_cq_event",
- "iw_tx_ring_full",
- "iw_tx_throttle",
- "iw_tx_sg_mapping_failure",
- "iw_tx_stalled",
- "iw_tx_credit_updates",
- "iw_rx_cq_call",
- "iw_rx_cq_event",
- "iw_rx_ring_empty",
- "iw_rx_refill_from_cq",
- "iw_rx_refill_from_thread",
- "iw_rx_alloc_limit",
- "iw_rx_credit_updates",
- "iw_ack_sent",
- "iw_ack_send_failure",
- "iw_ack_send_delayed",
- "iw_ack_send_piggybacked",
- "iw_ack_received",
- "iw_rdma_mr_alloc",
- "iw_rdma_mr_free",
- "iw_rdma_mr_used",
- "iw_rdma_mr_pool_flush",
- "iw_rdma_mr_pool_wait",
- "iw_rdma_mr_pool_depleted",
-};
-
-unsigned int rds_iw_stats_info_copy(struct rds_info_iterator *iter,
- unsigned int avail)
-{
- struct rds_iw_statistics stats = {0, };
- uint64_t *src;
- uint64_t *sum;
- size_t i;
- int cpu;
-
- if (avail < ARRAY_SIZE(rds_iw_stat_names))
- goto out;
-
- for_each_online_cpu(cpu) {
- src = (uint64_t *)&(per_cpu(rds_iw_stats, cpu));
- sum = (uint64_t *)&stats;
- for (i = 0; i < sizeof(stats) / sizeof(uint64_t); i++)
- *(sum++) += *(src++);
- }
-
- rds_stats_info_copy(iter, (uint64_t *)&stats, rds_iw_stat_names,
- ARRAY_SIZE(rds_iw_stat_names));
-out:
- return ARRAY_SIZE(rds_iw_stat_names);
-}
diff --git a/net/rds/iw_sysctl.c b/net/rds/iw_sysctl.c
deleted file mode 100644
index 139239d2cb22..000000000000
--- a/net/rds/iw_sysctl.c
+++ /dev/null
@@ -1,123 +0,0 @@
-/*
- * Copyright (c) 2006 Oracle. All rights reserved.
- *
- * This software is available to you under a choice of one of two
- * licenses. You may choose to be licensed under the terms of the GNU
- * General Public License (GPL) Version 2, available from the file
- * COPYING in the main directory of this source tree, or the
- * OpenIB.org BSD license below:
- *
- * Redistribution and use in source and binary forms, with or
- * without modification, are permitted provided that the following
- * conditions are met:
- *
- * - Redistributions of source code must retain the above
- * copyright notice, this list of conditions and the following
- * disclaimer.
- *
- * - Redistributions in binary form must reproduce the above
- * copyright notice, this list of conditions and the following
- * disclaimer in the documentation and/or other materials
- * provided with the distribution.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
- * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
- * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
- * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
- * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
- * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
- * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- *
- */
-#include <linux/kernel.h>
-#include <linux/sysctl.h>
-#include <linux/proc_fs.h>
-
-#include "iw.h"
-
-static struct ctl_table_header *rds_iw_sysctl_hdr;
-
-unsigned long rds_iw_sysctl_max_send_wr = RDS_IW_DEFAULT_SEND_WR;
-unsigned long rds_iw_sysctl_max_recv_wr = RDS_IW_DEFAULT_RECV_WR;
-unsigned long rds_iw_sysctl_max_recv_allocation = (128 * 1024 * 1024) / RDS_FRAG_SIZE;
-static unsigned long rds_iw_sysctl_max_wr_min = 1;
-/* hardware will fail CQ creation long before this */
-static unsigned long rds_iw_sysctl_max_wr_max = (u32)~0;
-
-unsigned long rds_iw_sysctl_max_unsig_wrs = 16;
-static unsigned long rds_iw_sysctl_max_unsig_wr_min = 1;
-static unsigned long rds_iw_sysctl_max_unsig_wr_max = 64;
-
-unsigned long rds_iw_sysctl_max_unsig_bytes = (16 << 20);
-static unsigned long rds_iw_sysctl_max_unsig_bytes_min = 1;
-static unsigned long rds_iw_sysctl_max_unsig_bytes_max = ~0UL;
-
-unsigned int rds_iw_sysctl_flow_control = 1;
-
-static struct ctl_table rds_iw_sysctl_table[] = {
- {
- .procname = "max_send_wr",
- .data = &rds_iw_sysctl_max_send_wr,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_iw_sysctl_max_wr_min,
- .extra2 = &rds_iw_sysctl_max_wr_max,
- },
- {
- .procname = "max_recv_wr",
- .data = &rds_iw_sysctl_max_recv_wr,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_iw_sysctl_max_wr_min,
- .extra2 = &rds_iw_sysctl_max_wr_max,
- },
- {
- .procname = "max_unsignaled_wr",
- .data = &rds_iw_sysctl_max_unsig_wrs,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_iw_sysctl_max_unsig_wr_min,
- .extra2 = &rds_iw_sysctl_max_unsig_wr_max,
- },
- {
- .procname = "max_unsignaled_bytes",
- .data = &rds_iw_sysctl_max_unsig_bytes,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- .extra1 = &rds_iw_sysctl_max_unsig_bytes_min,
- .extra2 = &rds_iw_sysctl_max_unsig_bytes_max,
- },
- {
- .procname = "max_recv_allocation",
- .data = &rds_iw_sysctl_max_recv_allocation,
- .maxlen = sizeof(unsigned long),
- .mode = 0644,
- .proc_handler = proc_doulongvec_minmax,
- },
- {
- .procname = "flow_control",
- .data = &rds_iw_sysctl_flow_control,
- .maxlen = sizeof(rds_iw_sysctl_flow_control),
- .mode = 0644,
- .proc_handler = proc_dointvec,
- },
- { }
-};
-
-void rds_iw_sysctl_exit(void)
-{
- unregister_net_sysctl_table(rds_iw_sysctl_hdr);
-}
-
-int rds_iw_sysctl_init(void)
-{
- rds_iw_sysctl_hdr = register_net_sysctl(&init_net, "net/rds/iw", rds_iw_sysctl_table);
- if (!rds_iw_sysctl_hdr)
- return -ENOMEM;
- return 0;
-}
diff --git a/net/rds/page.c b/net/rds/page.c
index 9005a2c920ee..e2b5a5832d3d 100644
--- a/net/rds/page.c
+++ b/net/rds/page.c
@@ -42,8 +42,8 @@ struct rds_page_remainder {
unsigned long r_offset;
};
-static DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder,
- rds_page_remainders);
+static
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rds_page_remainder, rds_page_remainders);
/*
* returns 0 on success or -errno on failure.
@@ -135,8 +135,8 @@ int rds_page_remainder_alloc(struct scatterlist *scat, unsigned long bytes,
if (rem->r_offset != 0)
rds_stats_inc(s_page_remainder_hit);
- rem->r_offset += bytes;
- if (rem->r_offset == PAGE_SIZE) {
+ rem->r_offset += ALIGN(bytes, 8);
+ if (rem->r_offset >= PAGE_SIZE) {
__free_page(rem->r_page);
rem->r_page = NULL;
}
@@ -179,37 +179,18 @@ out:
}
EXPORT_SYMBOL_GPL(rds_page_remainder_alloc);
-static int rds_page_remainder_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+void rds_page_exit(void)
{
- struct rds_page_remainder *rem;
- long cpu = (long)hcpu;
+ unsigned int cpu;
- rem = &per_cpu(rds_page_remainders, cpu);
+ for_each_possible_cpu(cpu) {
+ struct rds_page_remainder *rem;
- rdsdebug("cpu %ld action 0x%lx\n", cpu, action);
+ rem = &per_cpu(rds_page_remainders, cpu);
+ rdsdebug("cpu %u\n", cpu);
- switch (action) {
- case CPU_DEAD:
if (rem->r_page)
__free_page(rem->r_page);
rem->r_page = NULL;
- break;
}
-
- return 0;
-}
-
-static struct notifier_block rds_page_remainder_nb = {
- .notifier_call = rds_page_remainder_cpu_notify,
-};
-
-void rds_page_exit(void)
-{
- int i;
-
- for_each_possible_cpu(i)
- rds_page_remainder_cpu_notify(&rds_page_remainder_nb,
- (unsigned long)CPU_DEAD,
- (void *)(long)i);
}
diff --git a/net/rds/rdma_transport.c b/net/rds/rdma_transport.c
index b9b40af5345b..7220bebcf558 100644
--- a/net/rds/rdma_transport.c
+++ b/net/rds/rdma_transport.c
@@ -49,9 +49,7 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
rdsdebug("conn %p id %p handling event %u (%s)\n", conn, cm_id,
event->event, rdma_event_msg(event->event));
- if (cm_id->device->node_type == RDMA_NODE_RNIC)
- trans = &rds_iw_transport;
- else
+ if (cm_id->device->node_type == RDMA_NODE_IB_CA)
trans = &rds_ib_transport;
/* Prevent shutdown from tearing down the connection
@@ -119,6 +117,14 @@ int rds_rdma_cm_event_handler(struct rdma_cm_id *cm_id,
rds_conn_drop(conn);
break;
+ case RDMA_CM_EVENT_TIMEWAIT_EXIT:
+ if (conn) {
+ pr_info("RDS: RDMA_CM_EVENT_TIMEWAIT_EXIT event: dropping connection %pI4->%pI4\n",
+ &conn->c_laddr, &conn->c_faddr);
+ rds_conn_drop(conn);
+ }
+ break;
+
default:
/* things like device disconnect? */
printk(KERN_ERR "RDS: unknown event %u (%s)!\n",
@@ -142,8 +148,8 @@ static int rds_rdma_listen_init(void)
struct rdma_cm_id *cm_id;
int ret;
- cm_id = rdma_create_id(rds_rdma_cm_event_handler, NULL, RDMA_PS_TCP,
- IB_QPT_RC);
+ cm_id = rdma_create_id(&init_net, rds_rdma_cm_event_handler, NULL,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(cm_id)) {
ret = PTR_ERR(cm_id);
printk(KERN_ERR "RDS/RDMA: failed to setup listener, "
@@ -200,10 +206,6 @@ static int rds_rdma_init(void)
if (ret)
goto out;
- ret = rds_iw_init();
- if (ret)
- goto err_iw_init;
-
ret = rds_ib_init();
if (ret)
goto err_ib_init;
@@ -211,8 +213,6 @@ static int rds_rdma_init(void)
goto out;
err_ib_init:
- rds_iw_exit();
-err_iw_init:
rds_rdma_listen_stop();
out:
return ret;
@@ -224,11 +224,10 @@ static void rds_rdma_exit(void)
/* stop listening first to ensure no new connections are attempted */
rds_rdma_listen_stop();
rds_ib_exit();
- rds_iw_exit();
}
module_exit(rds_rdma_exit);
MODULE_AUTHOR("Oracle Corporation <rds-devel@oss.oracle.com>");
-MODULE_DESCRIPTION("RDS: IB/iWARP transport");
+MODULE_DESCRIPTION("RDS: IB transport");
MODULE_LICENSE("Dual BSD/GPL");
diff --git a/net/rds/rdma_transport.h b/net/rds/rdma_transport.h
index faba4e382695..ff2010e9d20c 100644
--- a/net/rds/rdma_transport.h
+++ b/net/rds/rdma_transport.h
@@ -16,9 +16,4 @@ extern struct rds_transport rds_ib_transport;
int rds_ib_init(void);
void rds_ib_exit(void);
-/* from iw.c */
-extern struct rds_transport rds_iw_transport;
-int rds_iw_init(void);
-void rds_iw_exit(void);
-
#endif
diff --git a/net/rds/rds.h b/net/rds/rds.h
index 0e2797bdc316..80256b08eac0 100644
--- a/net/rds/rds.h
+++ b/net/rds/rds.h
@@ -222,6 +222,7 @@ struct rds_incoming {
__be32 i_saddr;
rds_rdma_cookie_t i_rdma_cookie;
+ struct timeval i_rx_tstamp;
};
struct rds_mr {
diff --git a/net/rds/recv.c b/net/rds/recv.c
index a00462b0d01d..c0be1ecd11c9 100644
--- a/net/rds/recv.c
+++ b/net/rds/recv.c
@@ -35,6 +35,8 @@
#include <net/sock.h>
#include <linux/in.h>
#include <linux/export.h>
+#include <linux/time.h>
+#include <linux/rds.h>
#include "rds.h"
@@ -46,6 +48,8 @@ void rds_inc_init(struct rds_incoming *inc, struct rds_connection *conn,
inc->i_conn = conn;
inc->i_saddr = saddr;
inc->i_rdma_cookie = 0;
+ inc->i_rx_tstamp.tv_sec = 0;
+ inc->i_rx_tstamp.tv_usec = 0;
}
EXPORT_SYMBOL_GPL(rds_inc_init);
@@ -228,6 +232,8 @@ void rds_recv_incoming(struct rds_connection *conn, __be32 saddr, __be32 daddr,
rds_recv_rcvbuf_delta(rs, sk, inc->i_conn->c_lcong,
be32_to_cpu(inc->i_hdr.h_len),
inc->i_hdr.h_dport);
+ if (sock_flag(sk, SOCK_RCVTSTAMP))
+ do_gettimeofday(&inc->i_rx_tstamp);
rds_inc_addref(inc);
list_add_tail(&inc->i_item, &rs->rs_recv_queue);
__rds_wake_sk_sleep(sk);
@@ -381,7 +387,8 @@ static int rds_notify_cong(struct rds_sock *rs, struct msghdr *msghdr)
/*
* Receive any control messages.
*/
-static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
+static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg,
+ struct rds_sock *rs)
{
int ret = 0;
@@ -392,6 +399,15 @@ static int rds_cmsg_recv(struct rds_incoming *inc, struct msghdr *msg)
return ret;
}
+ if ((inc->i_rx_tstamp.tv_sec != 0) &&
+ sock_flag(rds_rs_to_sk(rs), SOCK_RCVTSTAMP)) {
+ ret = put_cmsg(msg, SOL_SOCKET, SCM_TIMESTAMP,
+ sizeof(struct timeval),
+ &inc->i_rx_tstamp);
+ if (ret)
+ return ret;
+ }
+
return 0;
}
@@ -474,7 +490,7 @@ int rds_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
msg->msg_flags |= MSG_TRUNC;
}
- if (rds_cmsg_recv(inc, msg)) {
+ if (rds_cmsg_recv(inc, msg, rs)) {
ret = -EFAULT;
goto out;
}
diff --git a/net/rds/send.c b/net/rds/send.c
index 827155c2ead1..c9cdb358ea88 100644
--- a/net/rds/send.c
+++ b/net/rds/send.c
@@ -1013,11 +1013,13 @@ int rds_sendmsg(struct socket *sock, struct msghdr *msg, size_t payload_len)
release_sock(sk);
}
- /* racing with another thread binding seems ok here */
+ lock_sock(sk);
if (daddr == 0 || rs->rs_bound_addr == 0) {
+ release_sock(sk);
ret = -ENOTCONN; /* XXX not a great errno */
goto out;
}
+ release_sock(sk);
if (payload_len > rds_sk_sndbuf(rs)) {
ret = -EMSGSIZE;
diff --git a/net/rds/tcp.c b/net/rds/tcp.c
index 9d6ddbacd875..61ed2a8764ba 100644
--- a/net/rds/tcp.c
+++ b/net/rds/tcp.c
@@ -37,7 +37,6 @@
#include <net/tcp.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
-#include <net/tcp.h>
#include "rds.h"
#include "tcp.h"
@@ -53,7 +52,34 @@ static LIST_HEAD(rds_tcp_conn_list);
static struct kmem_cache *rds_tcp_conn_slab;
-#define RDS_TCP_DEFAULT_BUFSIZE (128 * 1024)
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *fpos);
+
+int rds_tcp_min_sndbuf = SOCK_MIN_SNDBUF;
+int rds_tcp_min_rcvbuf = SOCK_MIN_RCVBUF;
+
+static struct ctl_table rds_tcp_sysctl_table[] = {
+#define RDS_TCP_SNDBUF 0
+ {
+ .procname = "rds_tcp_sndbuf",
+ /* data is per-net pointer */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rds_tcp_skbuf_handler,
+ .extra1 = &rds_tcp_min_sndbuf,
+ },
+#define RDS_TCP_RCVBUF 1
+ {
+ .procname = "rds_tcp_rcvbuf",
+ /* data is per-net pointer */
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = rds_tcp_skbuf_handler,
+ .extra1 = &rds_tcp_min_rcvbuf,
+ },
+ { }
+};
/* doing it this way avoids calling tcp_sk() */
void rds_tcp_nonagle(struct socket *sock)
@@ -67,15 +93,6 @@ void rds_tcp_nonagle(struct socket *sock)
set_fs(oldfs);
}
-/* All module specific customizations to the RDS-TCP socket should be done in
- * rds_tcp_tune() and applied after socket creation. In general these
- * customizations should be tunable via module_param()
- */
-void rds_tcp_tune(struct socket *sock)
-{
- rds_tcp_nonagle(sock);
-}
-
u32 rds_tcp_snd_nxt(struct rds_tcp_connection *tc)
{
return tcp_sk(tc->t_sock->sk)->snd_nxt;
@@ -273,8 +290,34 @@ static int rds_tcp_netid;
struct rds_tcp_net {
struct socket *rds_tcp_listen_sock;
struct work_struct rds_tcp_accept_w;
+ struct ctl_table_header *rds_tcp_sysctl;
+ struct ctl_table *ctl_table;
+ int sndbuf_size;
+ int rcvbuf_size;
};
+/* All module specific customizations to the RDS-TCP socket should be done in
+ * rds_tcp_tune() and applied after socket creation.
+ */
+void rds_tcp_tune(struct socket *sock)
+{
+ struct sock *sk = sock->sk;
+ struct net *net = sock_net(sk);
+ struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+
+ rds_tcp_nonagle(sock);
+ lock_sock(sk);
+ if (rtn->sndbuf_size > 0) {
+ sk->sk_sndbuf = rtn->sndbuf_size;
+ sk->sk_userlocks |= SOCK_SNDBUF_LOCK;
+ }
+ if (rtn->rcvbuf_size > 0) {
+ sk->sk_sndbuf = rtn->rcvbuf_size;
+ sk->sk_userlocks |= SOCK_RCVBUF_LOCK;
+ }
+ release_sock(sk);
+}
+
static void rds_tcp_accept_worker(struct work_struct *work)
{
struct rds_tcp_net *rtn = container_of(work,
@@ -296,20 +339,60 @@ void rds_tcp_accept_work(struct sock *sk)
static __net_init int rds_tcp_init_net(struct net *net)
{
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+ struct ctl_table *tbl;
+ int err = 0;
+ memset(rtn, 0, sizeof(*rtn));
+
+ /* {snd, rcv}buf_size default to 0, which implies we let the
+ * stack pick the value, and permit auto-tuning of buffer size.
+ */
+ if (net == &init_net) {
+ tbl = rds_tcp_sysctl_table;
+ } else {
+ tbl = kmemdup(rds_tcp_sysctl_table,
+ sizeof(rds_tcp_sysctl_table), GFP_KERNEL);
+ if (!tbl) {
+ pr_warn("could not set allocate syctl table\n");
+ return -ENOMEM;
+ }
+ rtn->ctl_table = tbl;
+ }
+ tbl[RDS_TCP_SNDBUF].data = &rtn->sndbuf_size;
+ tbl[RDS_TCP_RCVBUF].data = &rtn->rcvbuf_size;
+ rtn->rds_tcp_sysctl = register_net_sysctl(net, "net/rds/tcp", tbl);
+ if (!rtn->rds_tcp_sysctl) {
+ pr_warn("could not register sysctl\n");
+ err = -ENOMEM;
+ goto fail;
+ }
rtn->rds_tcp_listen_sock = rds_tcp_listen_init(net);
if (!rtn->rds_tcp_listen_sock) {
pr_warn("could not set up listen sock\n");
- return -EAFNOSUPPORT;
+ unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+ rtn->rds_tcp_sysctl = NULL;
+ err = -EAFNOSUPPORT;
+ goto fail;
}
INIT_WORK(&rtn->rds_tcp_accept_w, rds_tcp_accept_worker);
return 0;
+
+fail:
+ if (net != &init_net)
+ kfree(tbl);
+ return err;
}
static void __net_exit rds_tcp_exit_net(struct net *net)
{
struct rds_tcp_net *rtn = net_generic(net, rds_tcp_netid);
+ if (rtn->rds_tcp_sysctl)
+ unregister_net_sysctl_table(rtn->rds_tcp_sysctl);
+
+ if (net != &init_net && rtn->ctl_table)
+ kfree(rtn->ctl_table);
+
/* If rds_tcp_exit_net() is called as a result of netns deletion,
* the rds_tcp_kill_sock() device notifier would already have cleaned
* up the listen socket, thus there is no work to do in this function.
@@ -384,6 +467,45 @@ static struct notifier_block rds_tcp_dev_notifier = {
.priority = -10, /* must be called after other network notifiers */
};
+/* when sysctl is used to modify some kernel socket parameters,this
+ * function resets the RDS connections in that netns so that we can
+ * restart with new parameters. The assumption is that such reset
+ * events are few and far-between.
+ */
+static void rds_tcp_sysctl_reset(struct net *net)
+{
+ struct rds_tcp_connection *tc, *_tc;
+
+ spin_lock_irq(&rds_tcp_conn_lock);
+ list_for_each_entry_safe(tc, _tc, &rds_tcp_conn_list, t_tcp_node) {
+ struct net *c_net = read_pnet(&tc->conn->c_net);
+
+ if (net != c_net || !tc->t_sock)
+ continue;
+
+ rds_conn_drop(tc->conn); /* reconnect with new parameters */
+ }
+ spin_unlock_irq(&rds_tcp_conn_lock);
+}
+
+static int rds_tcp_skbuf_handler(struct ctl_table *ctl, int write,
+ void __user *buffer, size_t *lenp,
+ loff_t *fpos)
+{
+ struct net *net = current->nsproxy->net_ns;
+ int err;
+
+ err = proc_dointvec_minmax(ctl, write, buffer, lenp, fpos);
+ if (err < 0) {
+ pr_warn("Invalid input. Must be >= %d\n",
+ *(int *)(ctl->extra1));
+ return err;
+ }
+ if (write)
+ rds_tcp_sysctl_reset(net);
+ return 0;
+}
+
static void rds_tcp_exit(void)
{
rds_info_deregister_func(RDS_INFO_TCP_SOCKETS, rds_tcp_tc_info);
diff --git a/net/rfkill/Kconfig b/net/rfkill/Kconfig
index 598d374f6a35..868f1ad0415a 100644
--- a/net/rfkill/Kconfig
+++ b/net/rfkill/Kconfig
@@ -41,5 +41,4 @@ config RFKILL_GPIO
default n
help
If you say yes here you get support of a generic gpio RFKILL
- driver. The platform should fill in the appropriate fields in the
- rfkill_gpio_platform_data structure and pass that to the driver.
+ driver.
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index b41e9ea2ffff..03f26e3a6f48 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -49,7 +49,6 @@
struct rfkill {
spinlock_t lock;
- const char *name;
enum rfkill_type type;
unsigned long state;
@@ -58,6 +57,8 @@ struct rfkill {
bool registered;
bool persistent;
+ bool polling_paused;
+ bool suspended;
const struct rfkill_ops *ops;
void *data;
@@ -73,6 +74,7 @@ struct rfkill {
struct delayed_work poll_work;
struct work_struct uevent_work;
struct work_struct sync_work;
+ char name[];
};
#define to_rfkill(d) container_of(d, struct rfkill, dev)
@@ -233,29 +235,6 @@ static void rfkill_event(struct rfkill *rfkill)
rfkill_send_events(rfkill, RFKILL_OP_CHANGE);
}
-static bool __rfkill_set_hw_state(struct rfkill *rfkill,
- bool blocked, bool *change)
-{
- unsigned long flags;
- bool prev, any;
-
- BUG_ON(!rfkill);
-
- spin_lock_irqsave(&rfkill->lock, flags);
- prev = !!(rfkill->state & RFKILL_BLOCK_HW);
- if (blocked)
- rfkill->state |= RFKILL_BLOCK_HW;
- else
- rfkill->state &= ~RFKILL_BLOCK_HW;
- *change = prev != blocked;
- any = !!(rfkill->state & RFKILL_BLOCK_ANY);
- spin_unlock_irqrestore(&rfkill->lock, flags);
-
- rfkill_led_trigger_event(rfkill);
-
- return any;
-}
-
/**
* rfkill_set_block - wrapper for set_block method
*
@@ -285,7 +264,7 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
spin_lock_irqsave(&rfkill->lock, flags);
prev = rfkill->state & RFKILL_BLOCK_SW;
- if (rfkill->state & RFKILL_BLOCK_SW)
+ if (prev)
rfkill->state |= RFKILL_BLOCK_SW_PREV;
else
rfkill->state &= ~RFKILL_BLOCK_SW_PREV;
@@ -303,8 +282,8 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
spin_lock_irqsave(&rfkill->lock, flags);
if (err) {
/*
- * Failed -- reset status to _prev, this may be different
- * from what set set _PREV to earlier in this function
+ * Failed -- reset status to _PREV, which may be different
+ * from what we have set _PREV to earlier in this function
* if rfkill_set_sw_state was invoked.
*/
if (rfkill->state & RFKILL_BLOCK_SW_PREV)
@@ -323,6 +302,19 @@ static void rfkill_set_block(struct rfkill *rfkill, bool blocked)
rfkill_event(rfkill);
}
+static void rfkill_update_global_state(enum rfkill_type type, bool blocked)
+{
+ int i;
+
+ if (type != RFKILL_TYPE_ALL) {
+ rfkill_global_states[type].cur = blocked;
+ return;
+ }
+
+ for (i = 0; i < NUM_RFKILL_TYPES; i++)
+ rfkill_global_states[i].cur = blocked;
+}
+
#ifdef CONFIG_RFKILL_INPUT
static atomic_t rfkill_input_disabled = ATOMIC_INIT(0);
@@ -332,8 +324,7 @@ static atomic_t rfkill_input_disabled = ATOMIC_INIT(0);
* @blocked: the new state
*
* This function sets the state of all switches of given type,
- * unless a specific switch is claimed by userspace (in which case,
- * that switch is left alone) or suspended.
+ * unless a specific switch is suspended.
*
* Caller must have acquired rfkill_global_mutex.
*/
@@ -341,15 +332,7 @@ static void __rfkill_switch_all(const enum rfkill_type type, bool blocked)
{
struct rfkill *rfkill;
- if (type == RFKILL_TYPE_ALL) {
- int i;
-
- for (i = 0; i < NUM_RFKILL_TYPES; i++)
- rfkill_global_states[i].cur = blocked;
- } else {
- rfkill_global_states[type].cur = blocked;
- }
-
+ rfkill_update_global_state(type, blocked);
list_for_each_entry(rfkill, &rfkill_list, node) {
if (rfkill->type != type && type != RFKILL_TYPE_ALL)
continue;
@@ -477,17 +460,28 @@ bool rfkill_get_global_sw_state(const enum rfkill_type type)
}
#endif
-
bool rfkill_set_hw_state(struct rfkill *rfkill, bool blocked)
{
- bool ret, change;
+ unsigned long flags;
+ bool ret, prev;
+
+ BUG_ON(!rfkill);
+
+ spin_lock_irqsave(&rfkill->lock, flags);
+ prev = !!(rfkill->state & RFKILL_BLOCK_HW);
+ if (blocked)
+ rfkill->state |= RFKILL_BLOCK_HW;
+ else
+ rfkill->state &= ~RFKILL_BLOCK_HW;
+ ret = !!(rfkill->state & RFKILL_BLOCK_ANY);
+ spin_unlock_irqrestore(&rfkill->lock, flags);
- ret = __rfkill_set_hw_state(rfkill, blocked, &change);
+ rfkill_led_trigger_event(rfkill);
if (!rfkill->registered)
return ret;
- if (change)
+ if (prev != blocked)
schedule_work(&rfkill->uevent_work);
return ret;
@@ -582,6 +576,34 @@ void rfkill_set_states(struct rfkill *rfkill, bool sw, bool hw)
}
EXPORT_SYMBOL(rfkill_set_states);
+static const char * const rfkill_types[] = {
+ NULL, /* RFKILL_TYPE_ALL */
+ "wlan",
+ "bluetooth",
+ "ultrawideband",
+ "wimax",
+ "wwan",
+ "gps",
+ "fm",
+ "nfc",
+};
+
+enum rfkill_type rfkill_find_type(const char *name)
+{
+ int i;
+
+ BUILD_BUG_ON(ARRAY_SIZE(rfkill_types) != NUM_RFKILL_TYPES);
+
+ if (!name)
+ return RFKILL_TYPE_ALL;
+
+ for (i = 1; i < NUM_RFKILL_TYPES; i++)
+ if (!strcmp(name, rfkill_types[i]))
+ return i;
+ return RFKILL_TYPE_ALL;
+}
+EXPORT_SYMBOL(rfkill_find_type);
+
static ssize_t name_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
@@ -591,38 +613,12 @@ static ssize_t name_show(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RO(name);
-static const char *rfkill_get_type_str(enum rfkill_type type)
-{
- BUILD_BUG_ON(NUM_RFKILL_TYPES != RFKILL_TYPE_NFC + 1);
-
- switch (type) {
- case RFKILL_TYPE_WLAN:
- return "wlan";
- case RFKILL_TYPE_BLUETOOTH:
- return "bluetooth";
- case RFKILL_TYPE_UWB:
- return "ultrawideband";
- case RFKILL_TYPE_WIMAX:
- return "wimax";
- case RFKILL_TYPE_WWAN:
- return "wwan";
- case RFKILL_TYPE_GPS:
- return "gps";
- case RFKILL_TYPE_FM:
- return "fm";
- case RFKILL_TYPE_NFC:
- return "nfc";
- default:
- BUG();
- }
-}
-
static ssize_t type_show(struct device *dev, struct device_attribute *attr,
char *buf)
{
struct rfkill *rfkill = to_rfkill(dev);
- return sprintf(buf, "%s\n", rfkill_get_type_str(rfkill->type));
+ return sprintf(buf, "%s\n", rfkill_types[rfkill->type]);
}
static DEVICE_ATTR_RO(type);
@@ -730,20 +726,12 @@ static ssize_t state_store(struct device *dev, struct device_attribute *attr,
}
static DEVICE_ATTR_RW(state);
-static ssize_t claim_show(struct device *dev, struct device_attribute *attr,
- char *buf)
-{
- return sprintf(buf, "%d\n", 0);
-}
-static DEVICE_ATTR_RO(claim);
-
static struct attribute *rfkill_dev_attrs[] = {
&dev_attr_name.attr,
&dev_attr_type.attr,
&dev_attr_index.attr,
&dev_attr_persistent.attr,
&dev_attr_state.attr,
- &dev_attr_claim.attr,
&dev_attr_soft.attr,
&dev_attr_hard.attr,
NULL,
@@ -768,7 +756,7 @@ static int rfkill_dev_uevent(struct device *dev, struct kobj_uevent_env *env)
if (error)
return error;
error = add_uevent_var(env, "RFKILL_TYPE=%s",
- rfkill_get_type_str(rfkill->type));
+ rfkill_types[rfkill->type]);
if (error)
return error;
spin_lock_irqsave(&rfkill->lock, flags);
@@ -786,6 +774,7 @@ void rfkill_pause_polling(struct rfkill *rfkill)
if (!rfkill->ops->poll)
return;
+ rfkill->polling_paused = true;
cancel_delayed_work_sync(&rfkill->poll_work);
}
EXPORT_SYMBOL(rfkill_pause_polling);
@@ -797,6 +786,11 @@ void rfkill_resume_polling(struct rfkill *rfkill)
if (!rfkill->ops->poll)
return;
+ rfkill->polling_paused = false;
+
+ if (rfkill->suspended)
+ return;
+
queue_delayed_work(system_power_efficient_wq,
&rfkill->poll_work, 0);
}
@@ -807,7 +801,8 @@ static int rfkill_suspend(struct device *dev)
{
struct rfkill *rfkill = to_rfkill(dev);
- rfkill_pause_polling(rfkill);
+ rfkill->suspended = true;
+ cancel_delayed_work_sync(&rfkill->poll_work);
return 0;
}
@@ -817,12 +812,16 @@ static int rfkill_resume(struct device *dev)
struct rfkill *rfkill = to_rfkill(dev);
bool cur;
+ rfkill->suspended = false;
+
if (!rfkill->persistent) {
cur = !!(rfkill->state & RFKILL_BLOCK_SW);
rfkill_set_block(rfkill, cur);
}
- rfkill_resume_polling(rfkill);
+ if (rfkill->ops->poll && !rfkill->polling_paused)
+ queue_delayed_work(system_power_efficient_wq,
+ &rfkill->poll_work, 0);
return 0;
}
@@ -876,14 +875,14 @@ struct rfkill * __must_check rfkill_alloc(const char *name,
if (WARN_ON(type == RFKILL_TYPE_ALL || type >= NUM_RFKILL_TYPES))
return NULL;
- rfkill = kzalloc(sizeof(*rfkill), GFP_KERNEL);
+ rfkill = kzalloc(sizeof(*rfkill) + strlen(name) + 1, GFP_KERNEL);
if (!rfkill)
return NULL;
spin_lock_init(&rfkill->lock);
INIT_LIST_HEAD(&rfkill->node);
rfkill->type = type;
- rfkill->name = name;
+ strcpy(rfkill->name, name);
rfkill->ops = ops;
rfkill->data = ops_data;
@@ -1095,17 +1094,6 @@ static unsigned int rfkill_fop_poll(struct file *file, poll_table *wait)
return res;
}
-static bool rfkill_readable(struct rfkill_data *data)
-{
- bool r;
-
- mutex_lock(&data->mtx);
- r = !list_empty(&data->events);
- mutex_unlock(&data->mtx);
-
- return r;
-}
-
static ssize_t rfkill_fop_read(struct file *file, char __user *buf,
size_t count, loff_t *pos)
{
@@ -1122,8 +1110,11 @@ static ssize_t rfkill_fop_read(struct file *file, char __user *buf,
goto out;
}
mutex_unlock(&data->mtx);
+ /* since we re-check and it just compares pointers,
+ * using !list_empty() without locking isn't a problem
+ */
ret = wait_event_interruptible(data->read_wait,
- rfkill_readable(data));
+ !list_empty(&data->events));
mutex_lock(&data->mtx);
if (ret)
@@ -1172,15 +1163,8 @@ static ssize_t rfkill_fop_write(struct file *file, const char __user *buf,
mutex_lock(&rfkill_global_mutex);
- if (ev.op == RFKILL_OP_CHANGE_ALL) {
- if (ev.type == RFKILL_TYPE_ALL) {
- enum rfkill_type i;
- for (i = 0; i < NUM_RFKILL_TYPES; i++)
- rfkill_global_states[i].cur = ev.soft;
- } else {
- rfkill_global_states[ev.type].cur = ev.soft;
- }
- }
+ if (ev.op == RFKILL_OP_CHANGE_ALL)
+ rfkill_update_global_state(ev.type, ev.soft);
list_for_each_entry(rfkill, &rfkill_list, node) {
if (rfkill->idx != ev.idx && ev.op != RFKILL_OP_CHANGE_ALL)
@@ -1269,10 +1253,8 @@ static struct miscdevice rfkill_miscdev = {
static int __init rfkill_init(void)
{
int error;
- int i;
- for (i = 0; i < NUM_RFKILL_TYPES; i++)
- rfkill_global_states[i].cur = !rfkill_default_state;
+ rfkill_update_global_state(RFKILL_TYPE_ALL, !rfkill_default_state);
error = class_register(&rfkill_class);
if (error)
diff --git a/net/rfkill/rfkill-gpio.c b/net/rfkill/rfkill-gpio.c
index 93127220cb54..76c01cbd56e3 100644
--- a/net/rfkill/rfkill-gpio.c
+++ b/net/rfkill/rfkill-gpio.c
@@ -27,8 +27,6 @@
#include <linux/acpi.h>
#include <linux/gpio/consumer.h>
-#include <linux/rfkill-gpio.h>
-
struct rfkill_gpio_data {
const char *name;
enum rfkill_type type;
@@ -81,7 +79,6 @@ static int rfkill_gpio_acpi_probe(struct device *dev,
if (!id)
return -ENODEV;
- rfkill->name = dev_name(dev);
rfkill->type = (unsigned)id->driver_data;
return acpi_dev_add_driver_gpios(ACPI_COMPANION(dev),
@@ -90,24 +87,27 @@ static int rfkill_gpio_acpi_probe(struct device *dev,
static int rfkill_gpio_probe(struct platform_device *pdev)
{
- struct rfkill_gpio_platform_data *pdata = pdev->dev.platform_data;
struct rfkill_gpio_data *rfkill;
struct gpio_desc *gpio;
+ const char *type_name;
int ret;
rfkill = devm_kzalloc(&pdev->dev, sizeof(*rfkill), GFP_KERNEL);
if (!rfkill)
return -ENOMEM;
+ device_property_read_string(&pdev->dev, "name", &rfkill->name);
+ device_property_read_string(&pdev->dev, "type", &type_name);
+
+ if (!rfkill->name)
+ rfkill->name = dev_name(&pdev->dev);
+
+ rfkill->type = rfkill_find_type(type_name);
+
if (ACPI_HANDLE(&pdev->dev)) {
ret = rfkill_gpio_acpi_probe(&pdev->dev, rfkill);
if (ret)
return ret;
- } else if (pdata) {
- rfkill->name = pdata->name;
- rfkill->type = pdata->type;
- } else {
- return -ENODEV;
}
rfkill->clk = devm_clk_get(&pdev->dev, NULL);
@@ -124,10 +124,8 @@ static int rfkill_gpio_probe(struct platform_device *pdev)
rfkill->shutdown_gpio = gpio;
- /* Make sure at-least one of the GPIO is defined and that
- * a name is specified for this instance
- */
- if ((!rfkill->reset_gpio && !rfkill->shutdown_gpio) || !rfkill->name) {
+ /* Make sure at-least one GPIO is defined for this instance */
+ if (!rfkill->reset_gpio && !rfkill->shutdown_gpio) {
dev_err(&pdev->dev, "invalid platform data\n");
return -EINVAL;
}
@@ -163,10 +161,6 @@ static int rfkill_gpio_remove(struct platform_device *pdev)
#ifdef CONFIG_ACPI
static const struct acpi_device_id rfkill_acpi_match[] = {
- { "BCM2E1A", RFKILL_TYPE_BLUETOOTH },
- { "BCM2E3D", RFKILL_TYPE_BLUETOOTH },
- { "BCM2E40", RFKILL_TYPE_BLUETOOTH },
- { "BCM2E64", RFKILL_TYPE_BLUETOOTH },
{ "BCM4752", RFKILL_TYPE_GPS },
{ "LNV4752", RFKILL_TYPE_GPS },
{ },
diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
index 1f8a144a5dc2..9d935fa5a2a9 100644
--- a/net/rxrpc/af_rxrpc.c
+++ b/net/rxrpc/af_rxrpc.c
@@ -37,7 +37,7 @@ static struct proto rxrpc_proto;
static const struct proto_ops rxrpc_rpc_ops;
/* local epoch for detecting local-end reset */
-__be32 rxrpc_epoch;
+u32 rxrpc_epoch;
/* current debugging ID */
atomic_t rxrpc_debug_id;
@@ -67,7 +67,7 @@ static void rxrpc_write_space(struct sock *sk)
if (rxrpc_writable(sk)) {
struct socket_wq *wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible(&wq->wait);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
}
@@ -81,6 +81,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
struct sockaddr_rxrpc *srx,
int len)
{
+ unsigned int tail;
+
if (len < sizeof(struct sockaddr_rxrpc))
return -EINVAL;
@@ -103,9 +105,7 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
_debug("INET: %x @ %pI4",
ntohs(srx->transport.sin.sin_port),
&srx->transport.sin.sin_addr);
- if (srx->transport_len > 8)
- memset((void *)&srx->transport + 8, 0,
- srx->transport_len - 8);
+ tail = offsetof(struct sockaddr_rxrpc, transport.sin.__pad);
break;
case AF_INET6:
@@ -113,6 +113,8 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
return -EAFNOSUPPORT;
}
+ if (tail < len)
+ memset((void *)srx + tail, 0, len - tail);
return 0;
}
@@ -121,11 +123,10 @@ static int rxrpc_validate_address(struct rxrpc_sock *rx,
*/
static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
{
- struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *) saddr;
+ struct sockaddr_rxrpc *srx = (struct sockaddr_rxrpc *)saddr;
struct sock *sk = sock->sk;
struct rxrpc_local *local;
struct rxrpc_sock *rx = rxrpc_sk(sk), *prx;
- __be16 service_id;
int ret;
_enter("%p,%p,%d", rx, saddr, len);
@@ -143,7 +144,7 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
memcpy(&rx->srx, srx, sizeof(rx->srx));
- /* find a local transport endpoint if we don't have one already */
+ /* Find or create a local transport endpoint to use */
local = rxrpc_lookup_local(&rx->srx);
if (IS_ERR(local)) {
ret = PTR_ERR(local);
@@ -152,14 +153,12 @@ static int rxrpc_bind(struct socket *sock, struct sockaddr *saddr, int len)
rx->local = local;
if (srx->srx_service) {
- service_id = htons(srx->srx_service);
write_lock_bh(&local->services_lock);
list_for_each_entry(prx, &local->services, listen_link) {
- if (prx->service_id == service_id)
+ if (prx->srx.srx_service == srx->srx_service)
goto service_in_use;
}
- rx->service_id = service_id;
list_add_tail(&rx->listen_link, &local->services);
write_unlock_bh(&local->services_lock);
@@ -276,7 +275,6 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
struct rxrpc_transport *trans;
struct rxrpc_call *call;
struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
- __be16 service_id;
_enter(",,%x,%lx", key_serial(key), user_call_ID);
@@ -299,16 +297,14 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
atomic_inc(&trans->usage);
}
- service_id = rx->service_id;
- if (srx)
- service_id = htons(srx->srx_service);
-
+ if (!srx)
+ srx = &rx->srx;
if (!key)
key = rx->key;
if (key && !key->payload.data[0])
key = NULL; /* a no-security key */
- bundle = rxrpc_get_bundle(rx, trans, key, service_id, gfp);
+ bundle = rxrpc_get_bundle(rx, trans, key, srx->srx_service, gfp);
if (IS_ERR(bundle)) {
call = ERR_CAST(bundle);
goto out;
@@ -324,7 +320,6 @@ out_notrans:
_leave(" = %p", call);
return call;
}
-
EXPORT_SYMBOL(rxrpc_kernel_begin_call);
/**
@@ -340,7 +335,6 @@ void rxrpc_kernel_end_call(struct rxrpc_call *call)
rxrpc_remove_user_ID(call->socket, call);
rxrpc_put_call(call);
}
-
EXPORT_SYMBOL(rxrpc_kernel_end_call);
/**
@@ -425,7 +419,6 @@ static int rxrpc_connect(struct socket *sock, struct sockaddr *addr,
}
rx->trans = trans;
- rx->service_id = htons(srx->srx_service);
rx->sk.sk_state = RXRPC_CLIENT_CONNECTED;
release_sock(&rx->sk);
@@ -622,7 +615,7 @@ static int rxrpc_create(struct net *net, struct socket *sock, int protocol,
if (!net_eq(net, &init_net))
return -EAFNOSUPPORT;
- /* we support transport protocol UDP only */
+ /* we support transport protocol UDP/UDP6 only */
if (protocol != PF_INET)
return -EPROTONOSUPPORT;
@@ -754,7 +747,7 @@ static int rxrpc_release(struct socket *sock)
* RxRPC network protocol
*/
static const struct proto_ops rxrpc_rpc_ops = {
- .family = PF_UNIX,
+ .family = PF_RXRPC,
.owner = THIS_MODULE,
.release = rxrpc_release,
.bind = rxrpc_bind,
@@ -778,7 +771,7 @@ static struct proto rxrpc_proto = {
.name = "RXRPC",
.owner = THIS_MODULE,
.obj_size = sizeof(struct rxrpc_sock),
- .max_header = sizeof(struct rxrpc_header),
+ .max_header = sizeof(struct rxrpc_wire_header),
};
static const struct net_proto_family rxrpc_family_ops = {
@@ -796,7 +789,7 @@ static int __init af_rxrpc_init(void)
BUILD_BUG_ON(sizeof(struct rxrpc_skb_priv) > FIELD_SIZEOF(struct sk_buff, cb));
- rxrpc_epoch = htonl(get_seconds());
+ rxrpc_epoch = get_seconds();
ret = -ENOMEM;
rxrpc_call_jar = kmem_cache_create(
diff --git a/net/rxrpc/ar-accept.c b/net/rxrpc/ar-accept.c
index 6d79310fcaae..277731a5e67a 100644
--- a/net/rxrpc/ar-accept.c
+++ b/net/rxrpc/ar-accept.c
@@ -27,7 +27,7 @@
* generate a connection-level abort
*/
static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
- struct rxrpc_header *hdr)
+ struct rxrpc_wire_header *whdr)
{
struct msghdr msg;
struct kvec iov[1];
@@ -36,25 +36,21 @@ static int rxrpc_busy(struct rxrpc_local *local, struct sockaddr_rxrpc *srx,
_enter("%d,,", local->debug_id);
+ whdr->type = RXRPC_PACKET_TYPE_BUSY;
+ whdr->serial = htonl(1);
+
msg.msg_name = &srx->transport.sin;
msg.msg_namelen = sizeof(srx->transport.sin);
msg.msg_control = NULL;
msg.msg_controllen = 0;
msg.msg_flags = 0;
- hdr->seq = 0;
- hdr->type = RXRPC_PACKET_TYPE_BUSY;
- hdr->flags = 0;
- hdr->userStatus = 0;
- hdr->_rsvd = 0;
-
- iov[0].iov_base = hdr;
- iov[0].iov_len = sizeof(*hdr);
+ iov[0].iov_base = whdr;
+ iov[0].iov_len = sizeof(*whdr);
len = iov[0].iov_len;
- hdr->serial = htonl(1);
- _proto("Tx BUSY %%%u", ntohl(hdr->serial));
+ _proto("Tx BUSY %%1");
ret = kernel_sendmsg(local->socket, &msg, iov, 1, len);
if (ret < 0) {
@@ -185,8 +181,8 @@ invalid_service:
read_unlock_bh(&local->services_lock);
read_lock_bh(&call->state_lock);
- if (!test_bit(RXRPC_CALL_RELEASE, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events)) {
+ if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
rxrpc_get_call(call);
rxrpc_queue_call(call);
}
@@ -211,8 +207,8 @@ void rxrpc_accept_incoming_calls(struct work_struct *work)
struct rxrpc_skb_priv *sp;
struct sockaddr_rxrpc srx;
struct rxrpc_sock *rx;
+ struct rxrpc_wire_header whdr;
struct sk_buff *skb;
- __be16 service_id;
int ret;
_enter("%d", local->debug_id);
@@ -240,6 +236,19 @@ process_next_packet:
sp = rxrpc_skb(skb);
+ /* Set up a response packet header in case we need it */
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.seq = htonl(sp->hdr.seq);
+ whdr.serial = 0;
+ whdr.flags = 0;
+ whdr.type = 0;
+ whdr.userStatus = 0;
+ whdr.securityIndex = sp->hdr.securityIndex;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(sp->hdr.serviceId);
+
/* determine the remote address */
memset(&srx, 0, sizeof(srx));
srx.srx_family = AF_RXRPC;
@@ -256,10 +265,9 @@ process_next_packet:
}
/* get the socket providing the service */
- service_id = sp->hdr.serviceId;
read_lock_bh(&local->services_lock);
list_for_each_entry(rx, &local->services, listen_link) {
- if (rx->service_id == service_id &&
+ if (rx->srx.srx_service == sp->hdr.serviceId &&
rx->sk.sk_state != RXRPC_CLOSE)
goto found_service;
}
@@ -267,7 +275,7 @@ process_next_packet:
goto invalid_service;
found_service:
- _debug("found service %hd", ntohs(rx->service_id));
+ _debug("found service %hd", rx->srx.srx_service);
if (sk_acceptq_is_full(&rx->sk))
goto backlog_full;
sk_acceptq_added(&rx->sk);
@@ -296,7 +304,7 @@ found_service:
backlog_full:
read_unlock_bh(&local->services_lock);
busy:
- rxrpc_busy(local, &srx, &sp->hdr);
+ rxrpc_busy(local, &srx, &whdr);
rxrpc_free_skb(skb);
goto process_next_packet;
@@ -379,7 +387,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
rb_insert_color(&call->sock_node, &rx->calls);
if (test_and_set_bit(RXRPC_CALL_HAS_USERID, &call->flags))
BUG();
- if (test_and_set_bit(RXRPC_CALL_ACCEPTED, &call->events))
+ if (test_and_set_bit(RXRPC_CALL_EV_ACCEPTED, &call->events))
BUG();
rxrpc_queue_call(call);
@@ -395,7 +403,7 @@ struct rxrpc_call *rxrpc_accept_call(struct rxrpc_sock *rx,
out_release:
_debug("release %p", call);
if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
rxrpc_queue_call(call);
out_discard:
write_unlock_bh(&call->state_lock);
@@ -407,7 +415,7 @@ out:
}
/*
- * handle rejectance of a call by userspace
+ * Handle rejection of a call by userspace
* - reject the call at the front of the queue
*/
int rxrpc_reject_call(struct rxrpc_sock *rx)
@@ -434,7 +442,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx)
switch (call->state) {
case RXRPC_CALL_SERVER_ACCEPTING:
call->state = RXRPC_CALL_SERVER_BUSY;
- if (test_and_set_bit(RXRPC_CALL_REJECT_BUSY, &call->events))
+ if (test_and_set_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events))
rxrpc_queue_call(call);
ret = 0;
goto out_release;
@@ -458,7 +466,7 @@ int rxrpc_reject_call(struct rxrpc_sock *rx)
out_release:
_debug("release %p", call);
if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
rxrpc_queue_call(call);
out_discard:
write_unlock_bh(&call->state_lock);
@@ -487,7 +495,6 @@ struct rxrpc_call *rxrpc_kernel_accept_call(struct socket *sock,
_leave(" = %p", call);
return call;
}
-
EXPORT_SYMBOL(rxrpc_kernel_accept_call);
/**
@@ -506,5 +513,4 @@ int rxrpc_kernel_reject_call(struct socket *sock)
_leave(" = %d", ret);
return ret;
}
-
EXPORT_SYMBOL(rxrpc_kernel_reject_call);
diff --git a/net/rxrpc/ar-ack.c b/net/rxrpc/ar-ack.c
index e0547f521f20..16d967075eaf 100644
--- a/net/rxrpc/ar-ack.c
+++ b/net/rxrpc/ar-ack.c
@@ -23,7 +23,7 @@
* How long to wait before scheduling ACK generation after seeing a
* packet with RXRPC_REQUEST_ACK set (in jiffies).
*/
-unsigned rxrpc_requested_ack_delay = 1;
+unsigned int rxrpc_requested_ack_delay = 1;
/*
* How long to wait before scheduling an ACK with subtype DELAY (in jiffies).
@@ -32,7 +32,7 @@ unsigned rxrpc_requested_ack_delay = 1;
* all consumed within this time we will send a DELAY ACK if an ACK was not
* requested to let the sender know it doesn't need to resend.
*/
-unsigned rxrpc_soft_ack_delay = 1 * HZ;
+unsigned int rxrpc_soft_ack_delay = 1 * HZ;
/*
* How long to wait before scheduling an ACK with subtype IDLE (in jiffies).
@@ -41,7 +41,7 @@ unsigned rxrpc_soft_ack_delay = 1 * HZ;
* further packets aren't immediately received to decide when to send an IDLE
* ACK let the other end know that it can free up its Tx buffer space.
*/
-unsigned rxrpc_idle_ack_delay = 0.5 * HZ;
+unsigned int rxrpc_idle_ack_delay = 0.5 * HZ;
/*
* Receive window size in packets. This indicates the maximum number of
@@ -49,19 +49,19 @@ unsigned rxrpc_idle_ack_delay = 0.5 * HZ;
* limit is hit, we should generate an EXCEEDS_WINDOW ACK and discard further
* packets.
*/
-unsigned rxrpc_rx_window_size = 32;
+unsigned int rxrpc_rx_window_size = 32;
/*
* Maximum Rx MTU size. This indicates to the sender the size of jumbo packet
* made by gluing normal packets together that we're willing to handle.
*/
-unsigned rxrpc_rx_mtu = 5692;
+unsigned int rxrpc_rx_mtu = 5692;
/*
* The maximum number of fragments in a received jumbo packet that we tell the
* sender that we're willing to handle.
*/
-unsigned rxrpc_rx_jumbo_max = 4;
+unsigned int rxrpc_rx_jumbo_max = 4;
static const char *rxrpc_acks(u8 reason)
{
@@ -91,7 +91,7 @@ static const s8 rxrpc_ack_priority[] = {
* propose an ACK be sent
*/
void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- __be32 serial, bool immediate)
+ u32 serial, bool immediate)
{
unsigned long expiry;
s8 prior = rxrpc_ack_priority[ack_reason];
@@ -99,8 +99,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
ASSERTCMP(prior, >, 0);
_enter("{%d},%s,%%%x,%u",
- call->debug_id, rxrpc_acks(ack_reason), ntohl(serial),
- immediate);
+ call->debug_id, rxrpc_acks(ack_reason), serial, immediate);
if (prior < rxrpc_ack_priority[call->ackr_reason]) {
if (immediate)
@@ -139,7 +138,7 @@ void __rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
expiry = rxrpc_requested_ack_delay;
if (!expiry)
goto cancel_timer;
- if (!immediate || serial == cpu_to_be32(1)) {
+ if (!immediate || serial == 1) {
_debug("run defer timer");
goto run_timer;
}
@@ -157,11 +156,11 @@ run_timer:
return;
cancel_timer:
- _debug("cancel timer %%%u", ntohl(serial));
+ _debug("cancel timer %%%u", serial);
try_to_del_timer_sync(&call->ack_timer);
read_lock_bh(&call->state_lock);
if (call->state <= RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_ACK, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
rxrpc_queue_call(call);
read_unlock_bh(&call->state_lock);
}
@@ -170,7 +169,7 @@ cancel_timer:
* propose an ACK be sent, locking the call structure
*/
void rxrpc_propose_ACK(struct rxrpc_call *call, u8 ack_reason,
- __be32 serial, bool immediate)
+ u32 serial, bool immediate)
{
s8 prior = rxrpc_ack_priority[ack_reason];
@@ -193,7 +192,7 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
if (resend & 1) {
_debug("SET RESEND");
- set_bit(RXRPC_CALL_RESEND, &call->events);
+ set_bit(RXRPC_CALL_EV_RESEND, &call->events);
}
if (resend & 2) {
@@ -203,7 +202,7 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
} else {
_debug("KILL RESEND TIMER");
del_timer_sync(&call->resend_timer);
- clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
}
read_unlock_bh(&call->state_lock);
@@ -214,8 +213,8 @@ static void rxrpc_set_resend(struct rxrpc_call *call, u8 resend,
*/
static void rxrpc_resend(struct rxrpc_call *call)
{
+ struct rxrpc_wire_header *whdr;
struct rxrpc_skb_priv *sp;
- struct rxrpc_header *hdr;
struct sk_buff *txb;
unsigned long *p_txb, resend_at;
bool stop;
@@ -247,14 +246,13 @@ static void rxrpc_resend(struct rxrpc_call *call)
sp->need_resend = false;
/* each Tx packet has a new serial number */
- sp->hdr.serial =
- htonl(atomic_inc_return(&call->conn->serial));
+ sp->hdr.serial = atomic_inc_return(&call->conn->serial);
- hdr = (struct rxrpc_header *) txb->head;
- hdr->serial = sp->hdr.serial;
+ whdr = (struct rxrpc_wire_header *)txb->head;
+ whdr->serial = htonl(sp->hdr.serial);
_proto("Tx DATA %%%u { #%d }",
- ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+ sp->hdr.serial, sp->hdr.seq);
if (rxrpc_send_packet(call->conn->trans, txb) < 0) {
stop = true;
sp->resend_at = jiffies + 3;
@@ -428,7 +426,7 @@ static void rxrpc_rotate_tx_window(struct rxrpc_call *call, u32 hard)
int tail = call->acks_tail, old_tail;
int win = CIRC_CNT(call->acks_head, tail, call->acks_winsz);
- _enter("{%u,%u},%u", call->acks_hard, win, hard);
+ kenter("{%u,%u},%u", call->acks_hard, win, hard);
ASSERTCMP(hard - call->acks_hard, <=, win);
@@ -478,11 +476,11 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
sp = rxrpc_skb(skb);
_debug("drain OOS packet %d [%d]",
- ntohl(sp->hdr.seq), call->rx_first_oos);
+ sp->hdr.seq, call->rx_first_oos);
- if (ntohl(sp->hdr.seq) != call->rx_first_oos) {
+ if (sp->hdr.seq != call->rx_first_oos) {
skb_queue_head(&call->rx_oos_queue, skb);
- call->rx_first_oos = ntohl(rxrpc_skb(skb)->hdr.seq);
+ call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
_debug("requeue %p {%u}", skb, call->rx_first_oos);
} else {
skb->mark = RXRPC_SKB_MARK_DATA;
@@ -496,8 +494,7 @@ static int rxrpc_drain_rx_oos_queue(struct rxrpc_call *call)
/* find out what the next packet is */
skb = skb_peek(&call->rx_oos_queue);
if (skb)
- call->rx_first_oos =
- ntohl(rxrpc_skb(skb)->hdr.seq);
+ call->rx_first_oos = rxrpc_skb(skb)->hdr.seq;
else
call->rx_first_oos = 0;
_debug("peek %p {%u}", skb, call->rx_first_oos);
@@ -522,7 +519,7 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
u32 seq;
sp = rxrpc_skb(skb);
- seq = ntohl(sp->hdr.seq);
+ seq = sp->hdr.seq;
_enter(",,{%u}", seq);
skb->destructor = rxrpc_packet_destructor;
@@ -535,9 +532,8 @@ static void rxrpc_insert_oos_packet(struct rxrpc_call *call,
skb_queue_walk(&call->rx_oos_queue, p) {
psp = rxrpc_skb(p);
- if (ntohl(psp->hdr.seq) > seq) {
- _debug("insert oos #%u before #%u",
- seq, ntohl(psp->hdr.seq));
+ if (psp->hdr.seq > seq) {
+ _debug("insert oos #%u before #%u", seq, psp->hdr.seq);
skb_insert(p, skb, &call->rx_oos_queue);
goto inserted;
}
@@ -555,7 +551,7 @@ inserted:
if (call->state < RXRPC_CALL_COMPLETE &&
call->rx_data_post == call->rx_first_oos) {
_debug("drain rx oos now");
- set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events);
+ set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
}
read_unlock(&call->state_lock);
@@ -586,7 +582,7 @@ static void rxrpc_zap_tx_window(struct rxrpc_call *call)
skb = (struct sk_buff *) _skb;
sp = rxrpc_skb(skb);
- _debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
+ _debug("+++ clear Tx %u", sp->hdr.seq);
rxrpc_free_skb(skb);
}
@@ -657,8 +653,7 @@ process_further:
/* data packets that wind up here have been received out of
* order, need security processing or are jumbo packets */
case RXRPC_PACKET_TYPE_DATA:
- _proto("OOSQ DATA %%%u { #%u }",
- ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+ _proto("OOSQ DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
/* secured packets must be verified and possibly decrypted */
if (rxrpc_verify_packet(call, skb, _abort_code) < 0)
@@ -676,7 +671,7 @@ process_further:
if (!skb_pull(skb, sizeof(ack)))
BUG();
- latest = ntohl(sp->hdr.serial);
+ latest = sp->hdr.serial;
hard = ntohl(ack.firstPacket);
tx = atomic_read(&call->sequence);
@@ -723,8 +718,10 @@ process_further:
if ((call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY ||
call->state == RXRPC_CALL_SERVER_AWAIT_ACK) &&
- hard > tx)
+ hard > tx) {
+ call->acks_hard = tx;
goto all_acked;
+ }
smp_rmb();
rxrpc_rotate_tx_window(call, hard - 1);
@@ -791,7 +788,7 @@ all_acked:
del_timer_sync(&call->resend_timer);
clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
if (call->acks_window)
rxrpc_zap_tx_window(call);
@@ -879,16 +876,17 @@ void rxrpc_process_call(struct work_struct *work)
{
struct rxrpc_call *call =
container_of(work, struct rxrpc_call, processor);
+ struct rxrpc_wire_header whdr;
struct rxrpc_ackpacket ack;
struct rxrpc_ackinfo ackinfo;
- struct rxrpc_header hdr;
struct msghdr msg;
struct kvec iov[5];
+ enum rxrpc_call_event genbit;
unsigned long bits;
__be32 data, pad;
size_t len;
- int genbit, loop, nbit, ioc, ret, mtu;
- u32 abort_code = RX_PROTOCOL_ERROR;
+ int loop, nbit, ioc, ret, mtu;
+ u32 serial, abort_code = RX_PROTOCOL_ERROR;
u8 *acks = NULL;
//printk("\n--------------------\n");
@@ -909,33 +907,33 @@ void rxrpc_process_call(struct work_struct *work)
msg.msg_controllen = 0;
msg.msg_flags = 0;
- hdr.epoch = call->conn->epoch;
- hdr.cid = call->cid;
- hdr.callNumber = call->call_id;
- hdr.seq = 0;
- hdr.type = RXRPC_PACKET_TYPE_ACK;
- hdr.flags = call->conn->out_clientflag;
- hdr.userStatus = 0;
- hdr.securityIndex = call->conn->security_ix;
- hdr._rsvd = 0;
- hdr.serviceId = call->conn->service_id;
+ whdr.epoch = htonl(call->conn->epoch);
+ whdr.cid = htonl(call->cid);
+ whdr.callNumber = htonl(call->call_id);
+ whdr.seq = 0;
+ whdr.type = RXRPC_PACKET_TYPE_ACK;
+ whdr.flags = call->conn->out_clientflag;
+ whdr.userStatus = 0;
+ whdr.securityIndex = call->conn->security_ix;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(call->service_id);
memset(iov, 0, sizeof(iov));
- iov[0].iov_base = &hdr;
- iov[0].iov_len = sizeof(hdr);
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
/* deal with events of a final nature */
- if (test_bit(RXRPC_CALL_RELEASE, &call->events)) {
+ if (test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
rxrpc_release_call(call);
- clear_bit(RXRPC_CALL_RELEASE, &call->events);
+ clear_bit(RXRPC_CALL_EV_RELEASE, &call->events);
}
- if (test_bit(RXRPC_CALL_RCVD_ERROR, &call->events)) {
+ if (test_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events)) {
int error;
- clear_bit(RXRPC_CALL_CONN_ABORT, &call->events);
- clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events);
- clear_bit(RXRPC_CALL_ABORT, &call->events);
+ clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
+ clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
error = call->conn->trans->peer->net_error;
_debug("post net error %d", error);
@@ -943,47 +941,47 @@ void rxrpc_process_call(struct work_struct *work)
if (rxrpc_post_message(call, RXRPC_SKB_MARK_NET_ERROR,
error, true) < 0)
goto no_mem;
- clear_bit(RXRPC_CALL_RCVD_ERROR, &call->events);
+ clear_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
goto kill_ACKs;
}
- if (test_bit(RXRPC_CALL_CONN_ABORT, &call->events)) {
+ if (test_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events)) {
ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
- clear_bit(RXRPC_CALL_REJECT_BUSY, &call->events);
- clear_bit(RXRPC_CALL_ABORT, &call->events);
+ clear_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_EV_ABORT, &call->events);
_debug("post conn abort");
if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
call->conn->error, true) < 0)
goto no_mem;
- clear_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+ clear_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
goto kill_ACKs;
}
- if (test_bit(RXRPC_CALL_REJECT_BUSY, &call->events)) {
- hdr.type = RXRPC_PACKET_TYPE_BUSY;
- genbit = RXRPC_CALL_REJECT_BUSY;
+ if (test_bit(RXRPC_CALL_EV_REJECT_BUSY, &call->events)) {
+ whdr.type = RXRPC_PACKET_TYPE_BUSY;
+ genbit = RXRPC_CALL_EV_REJECT_BUSY;
goto send_message;
}
- if (test_bit(RXRPC_CALL_ABORT, &call->events)) {
+ if (test_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
ASSERTCMP(call->state, >, RXRPC_CALL_COMPLETE);
if (rxrpc_post_message(call, RXRPC_SKB_MARK_LOCAL_ERROR,
ECONNABORTED, true) < 0)
goto no_mem;
- hdr.type = RXRPC_PACKET_TYPE_ABORT;
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
data = htonl(call->abort_code);
iov[1].iov_base = &data;
iov[1].iov_len = sizeof(data);
- genbit = RXRPC_CALL_ABORT;
+ genbit = RXRPC_CALL_EV_ABORT;
goto send_message;
}
- if (test_bit(RXRPC_CALL_ACK_FINAL, &call->events)) {
- genbit = RXRPC_CALL_ACK_FINAL;
+ if (test_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events)) {
+ genbit = RXRPC_CALL_EV_ACK_FINAL;
ack.bufferSpace = htons(8);
ack.maxSkew = 0;
@@ -993,9 +991,9 @@ void rxrpc_process_call(struct work_struct *work)
call->ackr_reason = 0;
spin_lock_bh(&call->lock);
- ack.serial = call->ackr_serial;
- ack.previousPacket = call->ackr_prev_seq;
- ack.firstPacket = htonl(call->rx_data_eaten + 1);
+ ack.serial = htonl(call->ackr_serial);
+ ack.previousPacket = htonl(call->ackr_prev_seq);
+ ack.firstPacket = htonl(call->rx_data_eaten + 1);
spin_unlock_bh(&call->lock);
pad = 0;
@@ -1009,12 +1007,12 @@ void rxrpc_process_call(struct work_struct *work)
goto send_ACK;
}
- if (call->events & ((1 << RXRPC_CALL_RCVD_BUSY) |
- (1 << RXRPC_CALL_RCVD_ABORT))
+ if (call->events & ((1 << RXRPC_CALL_EV_RCVD_BUSY) |
+ (1 << RXRPC_CALL_EV_RCVD_ABORT))
) {
u32 mark;
- if (test_bit(RXRPC_CALL_RCVD_ABORT, &call->events))
+ if (test_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events))
mark = RXRPC_SKB_MARK_REMOTE_ABORT;
else
mark = RXRPC_SKB_MARK_BUSY;
@@ -1024,22 +1022,22 @@ void rxrpc_process_call(struct work_struct *work)
if (rxrpc_post_message(call, mark, ECONNABORTED, true) < 0)
goto no_mem;
- clear_bit(RXRPC_CALL_RCVD_BUSY, &call->events);
- clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+ clear_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
+ clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
goto kill_ACKs;
}
- if (test_and_clear_bit(RXRPC_CALL_RCVD_ACKALL, &call->events)) {
+ if (test_and_clear_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events)) {
_debug("do implicit ackall");
rxrpc_clear_tx_window(call);
}
- if (test_bit(RXRPC_CALL_LIFE_TIMER, &call->events)) {
+ if (test_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events)) {
write_lock_bh(&call->state_lock);
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_LOCALLY_ABORTED;
call->abort_code = RX_CALL_TIMEOUT;
- set_bit(RXRPC_CALL_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
}
write_unlock_bh(&call->state_lock);
@@ -1048,7 +1046,7 @@ void rxrpc_process_call(struct work_struct *work)
ETIME, true) < 0)
goto no_mem;
- clear_bit(RXRPC_CALL_LIFE_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
goto kill_ACKs;
}
@@ -1069,13 +1067,13 @@ void rxrpc_process_call(struct work_struct *work)
}
/* handle resending */
- if (test_and_clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+ if (test_and_clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
rxrpc_resend_timer(call);
- if (test_and_clear_bit(RXRPC_CALL_RESEND, &call->events))
+ if (test_and_clear_bit(RXRPC_CALL_EV_RESEND, &call->events))
rxrpc_resend(call);
/* consider sending an ordinary ACK */
- if (test_bit(RXRPC_CALL_ACK, &call->events)) {
+ if (test_bit(RXRPC_CALL_EV_ACK, &call->events)) {
_debug("send ACK: window: %d - %d { %lx }",
call->rx_data_eaten, call->ackr_win_top,
call->ackr_window[0]);
@@ -1083,11 +1081,11 @@ void rxrpc_process_call(struct work_struct *work)
if (call->state > RXRPC_CALL_SERVER_ACK_REQUEST &&
call->ackr_reason != RXRPC_ACK_PING_RESPONSE) {
/* ACK by sending reply DATA packet in this state */
- clear_bit(RXRPC_CALL_ACK, &call->events);
+ clear_bit(RXRPC_CALL_EV_ACK, &call->events);
goto maybe_reschedule;
}
- genbit = RXRPC_CALL_ACK;
+ genbit = RXRPC_CALL_EV_ACK;
acks = kzalloc(call->ackr_win_top - call->rx_data_eaten,
GFP_NOFS);
@@ -1097,13 +1095,11 @@ void rxrpc_process_call(struct work_struct *work)
//hdr.flags = RXRPC_SLOW_START_OK;
ack.bufferSpace = htons(8);
ack.maxSkew = 0;
- ack.serial = 0;
- ack.reason = 0;
spin_lock_bh(&call->lock);
- ack.reason = call->ackr_reason;
- ack.serial = call->ackr_serial;
- ack.previousPacket = call->ackr_prev_seq;
+ ack.reason = call->ackr_reason;
+ ack.serial = htonl(call->ackr_serial);
+ ack.previousPacket = htonl(call->ackr_prev_seq);
ack.firstPacket = htonl(call->rx_data_eaten + 1);
ack.nAcks = 0;
@@ -1150,7 +1146,7 @@ void rxrpc_process_call(struct work_struct *work)
/* handle completion of security negotiations on an incoming
* connection */
- if (test_and_clear_bit(RXRPC_CALL_SECURED, &call->events)) {
+ if (test_and_clear_bit(RXRPC_CALL_EV_SECURED, &call->events)) {
_debug("secured");
spin_lock_bh(&call->lock);
@@ -1158,7 +1154,7 @@ void rxrpc_process_call(struct work_struct *work)
_debug("securing");
write_lock(&call->conn->lock);
if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_bit(RXRPC_CALL_RELEASE, &call->events)) {
+ !test_bit(RXRPC_CALL_EV_RELEASE, &call->events)) {
_debug("not released");
call->state = RXRPC_CALL_SERVER_ACCEPTING;
list_move_tail(&call->accept_link,
@@ -1167,39 +1163,39 @@ void rxrpc_process_call(struct work_struct *work)
write_unlock(&call->conn->lock);
read_lock(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE)
- set_bit(RXRPC_CALL_POST_ACCEPT, &call->events);
+ set_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
read_unlock(&call->state_lock);
}
spin_unlock_bh(&call->lock);
- if (!test_bit(RXRPC_CALL_POST_ACCEPT, &call->events))
+ if (!test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events))
goto maybe_reschedule;
}
/* post a notification of an acceptable connection to the app */
- if (test_bit(RXRPC_CALL_POST_ACCEPT, &call->events)) {
+ if (test_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events)) {
_debug("post accept");
if (rxrpc_post_message(call, RXRPC_SKB_MARK_NEW_CALL,
0, false) < 0)
goto no_mem;
- clear_bit(RXRPC_CALL_POST_ACCEPT, &call->events);
+ clear_bit(RXRPC_CALL_EV_POST_ACCEPT, &call->events);
goto maybe_reschedule;
}
/* handle incoming call acceptance */
- if (test_and_clear_bit(RXRPC_CALL_ACCEPTED, &call->events)) {
+ if (test_and_clear_bit(RXRPC_CALL_EV_ACCEPTED, &call->events)) {
_debug("accepted");
ASSERTCMP(call->rx_data_post, ==, 0);
call->rx_data_post = 1;
read_lock_bh(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE)
- set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events);
+ set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events);
read_unlock_bh(&call->state_lock);
}
/* drain the out of sequence received packet queue into the packet Rx
* queue */
- if (test_and_clear_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events)) {
+ if (test_and_clear_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events)) {
while (call->rx_data_post == call->rx_first_oos)
if (rxrpc_drain_rx_oos_queue(call) < 0)
break;
@@ -1222,9 +1218,10 @@ send_ACK:
ackinfo.rxMTU = htonl(rxrpc_rx_mtu);
ackinfo.jumbo_max = htonl(rxrpc_rx_jumbo_max);
- hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
+ serial = atomic_inc_return(&call->conn->serial);
+ whdr.serial = htonl(serial);
_proto("Tx ACK %%%u { m=%hu f=#%u p=#%u s=%%%u r=%s n=%u }",
- ntohl(hdr.serial),
+ serial,
ntohs(ack.maxSkew),
ntohl(ack.firstPacket),
ntohl(ack.previousPacket),
@@ -1240,8 +1237,9 @@ send_ACK:
send_message:
_debug("send message");
- hdr.serial = htonl(atomic_inc_return(&call->conn->serial));
- _proto("Tx %s %%%u", rxrpc_pkts[hdr.type], ntohl(hdr.serial));
+ serial = atomic_inc_return(&call->conn->serial);
+ whdr.serial = htonl(serial);
+ _proto("Tx %s %%%u", rxrpc_pkts[whdr.type], serial);
send_message_2:
len = iov[0].iov_len;
@@ -1278,12 +1276,12 @@ send_message_2:
}
switch (genbit) {
- case RXRPC_CALL_ABORT:
+ case RXRPC_CALL_EV_ABORT:
clear_bit(genbit, &call->events);
- clear_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+ clear_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
goto kill_ACKs;
- case RXRPC_CALL_ACK_FINAL:
+ case RXRPC_CALL_EV_ACK_FINAL:
write_lock_bh(&call->state_lock);
if (call->state == RXRPC_CALL_CLIENT_FINAL_ACK)
call->state = RXRPC_CALL_COMPLETE;
@@ -1308,9 +1306,9 @@ send_message_2:
kill_ACKs:
del_timer_sync(&call->ack_timer);
- if (test_and_clear_bit(RXRPC_CALL_ACK_FINAL, &call->events))
+ if (test_and_clear_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events))
rxrpc_put_call(call);
- clear_bit(RXRPC_CALL_ACK, &call->events);
+ clear_bit(RXRPC_CALL_EV_ACK, &call->events);
maybe_reschedule:
if (call->events || !skb_queue_empty(&call->rx_queue)) {
@@ -1324,12 +1322,11 @@ maybe_reschedule:
if (call->state >= RXRPC_CALL_COMPLETE &&
!list_empty(&call->accept_link)) {
_debug("X unlinking once-pending call %p { e=%lx f=%lx c=%x }",
- call, call->events, call->flags,
- ntohl(call->conn->cid));
+ call, call->events, call->flags, call->conn->cid);
read_lock_bh(&call->state_lock);
if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
rxrpc_queue_call(call);
read_unlock_bh(&call->state_lock);
}
@@ -1343,7 +1340,7 @@ error:
* this means there's a race between clearing the flag and setting the
* work pending bit and the work item being processed again */
if (call->events && !work_pending(&call->processor)) {
- _debug("jumpstart %x", ntohl(call->conn->cid));
+ _debug("jumpstart %x", call->conn->cid);
rxrpc_queue_call(call);
}
diff --git a/net/rxrpc/ar-call.c b/net/rxrpc/ar-call.c
index a9e05db0f5d5..7c8d300ade9b 100644
--- a/net/rxrpc/ar-call.c
+++ b/net/rxrpc/ar-call.c
@@ -21,14 +21,14 @@
/*
* Maximum lifetime of a call (in jiffies).
*/
-unsigned rxrpc_max_call_lifetime = 60 * HZ;
+unsigned int rxrpc_max_call_lifetime = 60 * HZ;
/*
* Time till dead call expires after last use (in jiffies).
*/
-unsigned rxrpc_dead_call_expiry = 2 * HZ;
+unsigned int rxrpc_dead_call_expiry = 2 * HZ;
-const char *const rxrpc_call_states[] = {
+const char *const rxrpc_call_states[NR__RXRPC_CALL_STATES] = {
[RXRPC_CALL_CLIENT_SEND_REQUEST] = "ClSndReq",
[RXRPC_CALL_CLIENT_AWAIT_REPLY] = "ClAwtRpl",
[RXRPC_CALL_CLIENT_RECV_REPLY] = "ClRcvRpl",
@@ -64,11 +64,11 @@ static DEFINE_HASHTABLE(rxrpc_call_hash, 10);
* Hash function for rxrpc_call_hash
*/
static unsigned long rxrpc_call_hashfunc(
- u8 clientflag,
- __be32 cid,
- __be32 call_id,
- __be32 epoch,
- __be16 service_id,
+ u8 in_clientflag,
+ u32 cid,
+ u32 call_id,
+ u32 epoch,
+ u16 service_id,
sa_family_t proto,
void *localptr,
unsigned int addr_size,
@@ -77,7 +77,6 @@ static unsigned long rxrpc_call_hashfunc(
const u16 *p;
unsigned int i;
unsigned long key;
- u32 hcid = ntohl(cid);
_enter("");
@@ -85,12 +84,12 @@ static unsigned long rxrpc_call_hashfunc(
/* We just want to add up the __be32 values, so forcing the
* cast should be okay.
*/
- key += (__force u32)epoch;
- key += (__force u16)service_id;
- key += (__force u32)call_id;
- key += (hcid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT;
- key += hcid & RXRPC_CHANNELMASK;
- key += clientflag;
+ key += epoch;
+ key += service_id;
+ key += call_id;
+ key += (cid & RXRPC_CIDMASK) >> RXRPC_CIDSHIFT;
+ key += cid & RXRPC_CHANNELMASK;
+ key += in_clientflag;
key += proto;
/* Step through the peer address in 16-bit portions for speed */
for (i = 0, p = (const u16 *)peer_addr; i < addr_size >> 1; i++, p++)
@@ -148,19 +147,16 @@ static void rxrpc_call_hash_del(struct rxrpc_call *call)
* isn't there.
*/
struct rxrpc_call *rxrpc_find_call_hash(
- u8 clientflag,
- __be32 cid,
- __be32 call_id,
- __be32 epoch,
- __be16 service_id,
+ struct rxrpc_host_header *hdr,
void *localptr,
sa_family_t proto,
- const u8 *peer_addr)
+ const void *peer_addr)
{
unsigned long key;
unsigned int addr_size = 0;
struct rxrpc_call *call = NULL;
struct rxrpc_call *ret = NULL;
+ u8 in_clientflag = hdr->flags & RXRPC_CLIENT_INITIATED;
_enter("");
switch (proto) {
@@ -174,20 +170,21 @@ struct rxrpc_call *rxrpc_find_call_hash(
break;
}
- key = rxrpc_call_hashfunc(clientflag, cid, call_id, epoch,
- service_id, proto, localptr, addr_size,
+ key = rxrpc_call_hashfunc(in_clientflag, hdr->cid, hdr->callNumber,
+ hdr->epoch, hdr->serviceId,
+ proto, localptr, addr_size,
peer_addr);
hash_for_each_possible_rcu(rxrpc_call_hash, call, hash_node, key) {
if (call->hash_key == key &&
- call->call_id == call_id &&
- call->cid == cid &&
- call->in_clientflag == clientflag &&
- call->service_id == service_id &&
+ call->call_id == hdr->callNumber &&
+ call->cid == hdr->cid &&
+ call->in_clientflag == in_clientflag &&
+ call->service_id == hdr->serviceId &&
call->proto == proto &&
call->local == localptr &&
memcmp(call->peer_ip.ipv6_addr, peer_addr,
- addr_size) == 0 &&
- call->epoch == epoch) {
+ addr_size) == 0 &&
+ call->epoch == hdr->epoch) {
ret = call;
break;
}
@@ -414,12 +411,12 @@ found_extant_second:
*/
struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
struct rxrpc_connection *conn,
- struct rxrpc_header *hdr,
+ struct rxrpc_host_header *hdr,
gfp_t gfp)
{
struct rxrpc_call *call, *candidate;
struct rb_node **p, *parent;
- __be32 call_id;
+ u32 call_id;
_enter(",%d,,%x", conn->debug_id, gfp);
@@ -433,7 +430,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
candidate->conn = conn;
candidate->cid = hdr->cid;
candidate->call_id = hdr->callNumber;
- candidate->channel = ntohl(hdr->cid) & RXRPC_CHANNELMASK;
+ candidate->channel = hdr->cid & RXRPC_CHANNELMASK;
candidate->rx_data_post = 0;
candidate->state = RXRPC_CALL_SERVER_ACCEPTING;
if (conn->security_ix > 0)
@@ -452,7 +449,7 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
read_lock(&call->state_lock);
switch (call->state) {
case RXRPC_CALL_LOCALLY_ABORTED:
- if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events))
+ if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
rxrpc_queue_call(call);
case RXRPC_CALL_REMOTELY_ABORTED:
read_unlock(&call->state_lock);
@@ -492,9 +489,9 @@ struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *rx,
/* The tree is sorted in order of the __be32 value without
* turning it into host order.
*/
- if ((__force u32)call_id < (__force u32)call->call_id)
+ if (call_id < call->call_id)
p = &(*p)->rb_left;
- else if ((__force u32)call_id > (__force u32)call->call_id)
+ else if (call_id > call->call_id)
p = &(*p)->rb_right;
else
goto old_call;
@@ -686,7 +683,7 @@ void rxrpc_release_call(struct rxrpc_call *call)
_debug("+++ ABORTING STATE %d +++\n", call->state);
call->state = RXRPC_CALL_LOCALLY_ABORTED;
call->abort_code = RX_CALL_DEAD;
- set_bit(RXRPC_CALL_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
rxrpc_queue_call(call);
}
write_unlock(&call->state_lock);
@@ -714,8 +711,7 @@ void rxrpc_release_call(struct rxrpc_call *call)
_debug("- zap %s %%%u #%u",
rxrpc_pkts[sp->hdr.type],
- ntohl(sp->hdr.serial),
- ntohl(sp->hdr.seq));
+ sp->hdr.serial, sp->hdr.seq);
rxrpc_free_skb(skb);
spin_lock_bh(&call->lock);
}
@@ -763,10 +759,10 @@ static void rxrpc_mark_call_released(struct rxrpc_call *call)
_debug("abort call %p", call);
call->state = RXRPC_CALL_LOCALLY_ABORTED;
call->abort_code = RX_CALL_DEAD;
- if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events))
+ if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events))
sched = true;
}
- if (!test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ if (!test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
sched = true;
if (sched)
rxrpc_queue_call(call);
@@ -873,9 +869,9 @@ static void rxrpc_cleanup_call(struct rxrpc_call *call)
unsigned long _skb;
_skb = call->acks_window[call->acks_tail] & ~1;
- sp = rxrpc_skb((struct sk_buff *) _skb);
- _debug("+++ clear Tx %u", ntohl(sp->hdr.seq));
- rxrpc_free_skb((struct sk_buff *) _skb);
+ sp = rxrpc_skb((struct sk_buff *)_skb);
+ _debug("+++ clear Tx %u", sp->hdr.seq);
+ rxrpc_free_skb((struct sk_buff *)_skb);
call->acks_tail =
(call->acks_tail + 1) & (call->acks_winsz - 1);
}
@@ -975,7 +971,7 @@ static void rxrpc_call_life_expired(unsigned long _call)
_enter("{%d}", call->debug_id);
read_lock_bh(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE) {
- set_bit(RXRPC_CALL_LIFE_TIMER, &call->events);
+ set_bit(RXRPC_CALL_EV_LIFE_TIMER, &call->events);
rxrpc_queue_call(call);
}
read_unlock_bh(&call->state_lock);
@@ -995,7 +991,7 @@ static void rxrpc_resend_time_expired(unsigned long _call)
return;
clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
- if (!test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+ if (!test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
rxrpc_queue_call(call);
}
@@ -1013,7 +1009,7 @@ static void rxrpc_ack_time_expired(unsigned long _call)
read_lock_bh(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_ACK, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_ACK, &call->events))
rxrpc_queue_call(call);
read_unlock_bh(&call->state_lock);
}
diff --git a/net/rxrpc/ar-connection.c b/net/rxrpc/ar-connection.c
index 692b3e67fb54..9942da1edbf6 100644
--- a/net/rxrpc/ar-connection.c
+++ b/net/rxrpc/ar-connection.c
@@ -21,7 +21,7 @@
/*
* Time till a connection expires after last use (in seconds).
*/
-unsigned rxrpc_connection_expiry = 10 * 60;
+unsigned int rxrpc_connection_expiry = 10 * 60;
static void rxrpc_connection_reaper(struct work_struct *work);
@@ -57,10 +57,10 @@ static struct rxrpc_conn_bundle *rxrpc_alloc_bundle(gfp_t gfp)
*/
static inline
int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
- struct key *key, __be16 service_id)
+ struct key *key, u16 service_id)
{
return (bundle->service_id - service_id) ?:
- ((unsigned long) bundle->key - (unsigned long) key);
+ ((unsigned long)bundle->key - (unsigned long)key);
}
/*
@@ -69,14 +69,14 @@ int rxrpc_cmp_bundle(const struct rxrpc_conn_bundle *bundle,
struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *rx,
struct rxrpc_transport *trans,
struct key *key,
- __be16 service_id,
+ u16 service_id,
gfp_t gfp)
{
struct rxrpc_conn_bundle *bundle, *candidate;
struct rb_node *p, *parent, **pp;
_enter("%p{%x},%x,%hx,",
- rx, key_serial(key), trans->debug_id, ntohs(service_id));
+ rx, key_serial(key), trans->debug_id, service_id);
if (rx->trans == trans && rx->bundle) {
atomic_inc(&rx->bundle->usage);
@@ -213,7 +213,7 @@ static struct rxrpc_connection *rxrpc_alloc_connection(gfp_t gfp)
conn->debug_id = atomic_inc_return(&rxrpc_debug_id);
conn->avail_calls = RXRPC_MAXCALLS;
conn->size_align = 4;
- conn->header_size = sizeof(struct rxrpc_header);
+ conn->header_size = sizeof(struct rxrpc_wire_header);
}
_leave(" = %p{%d}", conn, conn ? conn->debug_id : 0);
@@ -230,7 +230,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
struct rxrpc_connection *xconn;
struct rb_node *parent, **p;
__be32 epoch;
- u32 real_conn_id;
+ u32 cid;
_enter("");
@@ -241,7 +241,7 @@ static void rxrpc_assign_connection_id(struct rxrpc_connection *conn)
conn->trans->conn_idcounter += RXRPC_CID_INC;
if (conn->trans->conn_idcounter < RXRPC_CID_INC)
conn->trans->conn_idcounter = RXRPC_CID_INC;
- real_conn_id = conn->trans->conn_idcounter;
+ cid = conn->trans->conn_idcounter;
attempt_insertion:
parent = NULL;
@@ -255,9 +255,9 @@ attempt_insertion:
p = &(*p)->rb_left;
else if (epoch > xconn->epoch)
p = &(*p)->rb_right;
- else if (real_conn_id < xconn->real_conn_id)
+ else if (cid < xconn->cid)
p = &(*p)->rb_left;
- else if (real_conn_id > xconn->real_conn_id)
+ else if (cid > xconn->cid)
p = &(*p)->rb_right;
else
goto id_exists;
@@ -268,20 +268,19 @@ attempt_insertion:
rb_link_node(&conn->node, parent, p);
rb_insert_color(&conn->node, &conn->trans->client_conns);
- conn->real_conn_id = real_conn_id;
- conn->cid = htonl(real_conn_id);
+ conn->cid = cid;
write_unlock_bh(&conn->trans->conn_lock);
- _leave(" [CONNID %x CID %x]", real_conn_id, ntohl(conn->cid));
+ _leave(" [CID %x]", cid);
return;
/* we found a connection with the proposed ID - walk the tree from that
* point looking for the next unused ID */
id_exists:
for (;;) {
- real_conn_id += RXRPC_CID_INC;
- if (real_conn_id < RXRPC_CID_INC) {
- real_conn_id = RXRPC_CID_INC;
- conn->trans->conn_idcounter = real_conn_id;
+ cid += RXRPC_CID_INC;
+ if (cid < RXRPC_CID_INC) {
+ cid = RXRPC_CID_INC;
+ conn->trans->conn_idcounter = cid;
goto attempt_insertion;
}
@@ -291,7 +290,7 @@ id_exists:
xconn = rb_entry(parent, struct rxrpc_connection, node);
if (epoch < xconn->epoch ||
- real_conn_id < xconn->real_conn_id)
+ cid < xconn->cid)
goto attempt_insertion;
}
}
@@ -334,7 +333,7 @@ static void rxrpc_add_call_ID_to_conn(struct rxrpc_connection *conn,
*/
static int rxrpc_connect_exclusive(struct rxrpc_sock *rx,
struct rxrpc_transport *trans,
- __be16 service_id,
+ u16 service_id,
struct rxrpc_call *call,
gfp_t gfp)
{
@@ -404,11 +403,11 @@ found_channel:
conn->channels[chan] = call;
call->conn = conn;
call->channel = chan;
- call->cid = conn->cid | htonl(chan);
- call->call_id = htonl(++conn->call_counter);
+ call->cid = conn->cid | chan;
+ call->call_id = ++conn->call_counter;
_net("CONNECT client on conn %d chan %d as call %x",
- conn->debug_id, chan, ntohl(call->call_id));
+ conn->debug_id, chan, call->call_id);
spin_unlock(&trans->client_lock);
@@ -500,7 +499,7 @@ int rxrpc_connect_call(struct rxrpc_sock *rx,
if (bundle->num_conns >= 20) {
_debug("too many conns");
- if (!(gfp & __GFP_WAIT)) {
+ if (!gfpflags_allow_blocking(gfp)) {
_leave(" = -EAGAIN");
return -EAGAIN;
}
@@ -593,11 +592,11 @@ found_channel:
conn->channels[chan] = call;
call->conn = conn;
call->channel = chan;
- call->cid = conn->cid | htonl(chan);
- call->call_id = htonl(++conn->call_counter);
+ call->cid = conn->cid | chan;
+ call->call_id = ++conn->call_counter;
_net("CONNECT client on conn %d chan %d as call %x",
- conn->debug_id, chan, ntohl(call->call_id));
+ conn->debug_id, chan, call->call_id);
ASSERTCMP(conn->avail_calls, <, RXRPC_MAXCALLS);
spin_unlock(&trans->client_lock);
@@ -620,21 +619,21 @@ interrupted:
*/
struct rxrpc_connection *
rxrpc_incoming_connection(struct rxrpc_transport *trans,
- struct rxrpc_header *hdr,
+ struct rxrpc_host_header *hdr,
gfp_t gfp)
{
struct rxrpc_connection *conn, *candidate = NULL;
struct rb_node *p, **pp;
const char *new = "old";
__be32 epoch;
- u32 conn_id;
+ u32 cid;
_enter("");
ASSERT(hdr->flags & RXRPC_CLIENT_INITIATED);
epoch = hdr->epoch;
- conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+ cid = hdr->cid & RXRPC_CIDMASK;
/* search the connection list first */
read_lock_bh(&trans->conn_lock);
@@ -643,15 +642,15 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
while (p) {
conn = rb_entry(p, struct rxrpc_connection, node);
- _debug("maybe %x", conn->real_conn_id);
+ _debug("maybe %x", conn->cid);
if (epoch < conn->epoch)
p = p->rb_left;
else if (epoch > conn->epoch)
p = p->rb_right;
- else if (conn_id < conn->real_conn_id)
+ else if (cid < conn->cid)
p = p->rb_left;
- else if (conn_id > conn->real_conn_id)
+ else if (cid > conn->cid)
p = p->rb_right;
else
goto found_extant_connection;
@@ -668,12 +667,11 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
candidate->trans = trans;
candidate->epoch = hdr->epoch;
- candidate->cid = hdr->cid & cpu_to_be32(RXRPC_CIDMASK);
+ candidate->cid = hdr->cid & RXRPC_CIDMASK;
candidate->service_id = hdr->serviceId;
candidate->security_ix = hdr->securityIndex;
candidate->in_clientflag = RXRPC_CLIENT_INITIATED;
candidate->out_clientflag = 0;
- candidate->real_conn_id = conn_id;
candidate->state = RXRPC_CONN_SERVER;
if (candidate->service_id)
candidate->state = RXRPC_CONN_SERVER_UNSECURED;
@@ -690,9 +688,9 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
pp = &(*pp)->rb_left;
else if (epoch > conn->epoch)
pp = &(*pp)->rb_right;
- else if (conn_id < conn->real_conn_id)
+ else if (cid < conn->cid)
pp = &(*pp)->rb_left;
- else if (conn_id > conn->real_conn_id)
+ else if (cid > conn->cid)
pp = &(*pp)->rb_right;
else
goto found_extant_second;
@@ -714,7 +712,7 @@ rxrpc_incoming_connection(struct rxrpc_transport *trans,
new = "new";
success:
- _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->real_conn_id);
+ _net("CONNECTION %s %d {%x}", new, conn->debug_id, conn->cid);
_leave(" = %p {u=%d}", conn, atomic_read(&conn->usage));
return conn;
@@ -751,18 +749,17 @@ security_mismatch:
* packet
*/
struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
- struct rxrpc_header *hdr)
+ struct rxrpc_host_header *hdr)
{
struct rxrpc_connection *conn;
struct rb_node *p;
- __be32 epoch;
- u32 conn_id;
+ u32 epoch, cid;
- _enter(",{%x,%x}", ntohl(hdr->cid), hdr->flags);
+ _enter(",{%x,%x}", hdr->cid, hdr->flags);
read_lock_bh(&trans->conn_lock);
- conn_id = ntohl(hdr->cid) & RXRPC_CIDMASK;
+ cid = hdr->cid & RXRPC_CIDMASK;
epoch = hdr->epoch;
if (hdr->flags & RXRPC_CLIENT_INITIATED)
@@ -773,15 +770,15 @@ struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *trans,
while (p) {
conn = rb_entry(p, struct rxrpc_connection, node);
- _debug("maybe %x", conn->real_conn_id);
+ _debug("maybe %x", conn->cid);
if (epoch < conn->epoch)
p = p->rb_left;
else if (epoch > conn->epoch)
p = p->rb_right;
- else if (conn_id < conn->real_conn_id)
+ else if (cid < conn->cid)
p = p->rb_left;
- else if (conn_id > conn->real_conn_id)
+ else if (cid > conn->cid)
p = p->rb_right;
else
goto found;
diff --git a/net/rxrpc/ar-connevent.c b/net/rxrpc/ar-connevent.c
index e7ed43a54c41..1bdaaed8cdc4 100644
--- a/net/rxrpc/ar-connevent.c
+++ b/net/rxrpc/ar-connevent.c
@@ -42,9 +42,9 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
call->state = state;
call->abort_code = abort_code;
if (state == RXRPC_CALL_LOCALLY_ABORTED)
- set_bit(RXRPC_CALL_CONN_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_CONN_ABORT, &call->events);
else
- set_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
rxrpc_queue_call(call);
}
write_unlock(&call->state_lock);
@@ -60,11 +60,12 @@ static void rxrpc_abort_calls(struct rxrpc_connection *conn, int state,
static int rxrpc_abort_connection(struct rxrpc_connection *conn,
u32 error, u32 abort_code)
{
- struct rxrpc_header hdr;
+ struct rxrpc_wire_header whdr;
struct msghdr msg;
struct kvec iov[2];
__be32 word;
size_t len;
+ u32 serial;
int ret;
_enter("%d,,%u,%u", conn->debug_id, error, abort_code);
@@ -89,28 +90,29 @@ static int rxrpc_abort_connection(struct rxrpc_connection *conn,
msg.msg_controllen = 0;
msg.msg_flags = 0;
- hdr.epoch = conn->epoch;
- hdr.cid = conn->cid;
- hdr.callNumber = 0;
- hdr.seq = 0;
- hdr.type = RXRPC_PACKET_TYPE_ABORT;
- hdr.flags = conn->out_clientflag;
- hdr.userStatus = 0;
- hdr.securityIndex = conn->security_ix;
- hdr._rsvd = 0;
- hdr.serviceId = conn->service_id;
+ whdr.epoch = htonl(conn->epoch);
+ whdr.cid = htonl(conn->cid);
+ whdr.callNumber = 0;
+ whdr.seq = 0;
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
+ whdr.flags = conn->out_clientflag;
+ whdr.userStatus = 0;
+ whdr.securityIndex = conn->security_ix;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(conn->service_id);
word = htonl(abort_code);
- iov[0].iov_base = &hdr;
- iov[0].iov_len = sizeof(hdr);
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
iov[1].iov_base = &word;
iov[1].iov_len = sizeof(word);
len = iov[0].iov_len + iov[1].iov_len;
- hdr.serial = htonl(atomic_inc_return(&conn->serial));
- _proto("Tx CONN ABORT %%%u { %d }", ntohl(hdr.serial), abort_code);
+ serial = atomic_inc_return(&conn->serial);
+ whdr.serial = htonl(serial);
+ _proto("Tx CONN ABORT %%%u { %d }", serial, abort_code);
ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
if (ret < 0) {
@@ -132,7 +134,7 @@ static void rxrpc_call_is_secure(struct rxrpc_call *call)
if (call) {
read_lock(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_SECURED, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_SECURED, &call->events))
rxrpc_queue_call(call);
read_unlock(&call->state_lock);
}
@@ -146,8 +148,8 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
u32 *_abort_code)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- __be32 tmp;
- u32 serial;
+ __be32 wtmp;
+ u32 abort_code;
int loop, ret;
if (conn->state >= RXRPC_CONN_REMOTELY_ABORTED) {
@@ -155,19 +157,18 @@ static int rxrpc_process_event(struct rxrpc_connection *conn,
return -ECONNABORTED;
}
- serial = ntohl(sp->hdr.serial);
-
- _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, serial);
+ _enter("{%d},{%u,%%%u},", conn->debug_id, sp->hdr.type, sp->hdr.serial);
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_ABORT:
- if (skb_copy_bits(skb, 0, &tmp, sizeof(tmp)) < 0)
+ if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
return -EPROTO;
- _proto("Rx ABORT %%%u { ac=%d }", serial, ntohl(tmp));
+ abort_code = ntohl(wtmp);
+ _proto("Rx ABORT %%%u { ac=%d }", sp->hdr.serial, abort_code);
conn->state = RXRPC_CONN_REMOTELY_ABORTED;
rxrpc_abort_calls(conn, RXRPC_CALL_REMOTELY_ABORTED,
- ntohl(tmp));
+ abort_code);
return -ECONNABORTED;
case RXRPC_PACKET_TYPE_CHALLENGE:
@@ -335,7 +336,7 @@ void rxrpc_reject_packets(struct work_struct *work)
struct sockaddr_in sin;
} sa;
struct rxrpc_skb_priv *sp;
- struct rxrpc_header hdr;
+ struct rxrpc_wire_header whdr;
struct rxrpc_local *local;
struct sk_buff *skb;
struct msghdr msg;
@@ -348,11 +349,11 @@ void rxrpc_reject_packets(struct work_struct *work)
_enter("%d", local->debug_id);
- iov[0].iov_base = &hdr;
- iov[0].iov_len = sizeof(hdr);
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
iov[1].iov_base = &code;
iov[1].iov_len = sizeof(code);
- size = sizeof(hdr) + sizeof(code);
+ size = sizeof(whdr) + sizeof(code);
msg.msg_name = &sa;
msg.msg_control = NULL;
@@ -370,8 +371,8 @@ void rxrpc_reject_packets(struct work_struct *work)
break;
}
- memset(&hdr, 0, sizeof(hdr));
- hdr.type = RXRPC_PACKET_TYPE_ABORT;
+ memset(&whdr, 0, sizeof(whdr));
+ whdr.type = RXRPC_PACKET_TYPE_ABORT;
while ((skb = skb_dequeue(&local->reject_queue))) {
sp = rxrpc_skb(skb);
@@ -381,13 +382,13 @@ void rxrpc_reject_packets(struct work_struct *work)
sa.sin.sin_addr.s_addr = ip_hdr(skb)->saddr;
code = htonl(skb->priority);
- hdr.epoch = sp->hdr.epoch;
- hdr.cid = sp->hdr.cid;
- hdr.callNumber = sp->hdr.callNumber;
- hdr.serviceId = sp->hdr.serviceId;
- hdr.flags = sp->hdr.flags;
- hdr.flags ^= RXRPC_CLIENT_INITIATED;
- hdr.flags &= RXRPC_CLIENT_INITIATED;
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.serviceId = htons(sp->hdr.serviceId);
+ whdr.flags = sp->hdr.flags;
+ whdr.flags ^= RXRPC_CLIENT_INITIATED;
+ whdr.flags &= RXRPC_CLIENT_INITIATED;
kernel_sendmsg(local->socket, &msg, iov, 2, size);
break;
diff --git a/net/rxrpc/ar-error.c b/net/rxrpc/ar-error.c
index 0610efa83d72..3e82d6f0313c 100644
--- a/net/rxrpc/ar-error.c
+++ b/net/rxrpc/ar-error.c
@@ -115,7 +115,6 @@ void rxrpc_UDP_error_report(struct sock *sk)
/* pass the transport ref to error_handler to release */
skb_queue_tail(&trans->error_queue, skb);
rxrpc_queue_work(&trans->error_handler);
-
_leave("");
}
@@ -152,28 +151,18 @@ void rxrpc_UDP_error_handler(struct work_struct *work)
switch (ee->ee_code) {
case ICMP_NET_UNREACH:
_net("Rx Received ICMP Network Unreachable");
- err = ENETUNREACH;
break;
case ICMP_HOST_UNREACH:
_net("Rx Received ICMP Host Unreachable");
- err = EHOSTUNREACH;
break;
case ICMP_PORT_UNREACH:
_net("Rx Received ICMP Port Unreachable");
- err = ECONNREFUSED;
- break;
- case ICMP_FRAG_NEEDED:
- _net("Rx Received ICMP Fragmentation Needed (%d)",
- ee->ee_info);
- err = 0; /* dealt with elsewhere */
break;
case ICMP_NET_UNKNOWN:
_net("Rx Received ICMP Unknown Network");
- err = ENETUNREACH;
break;
case ICMP_HOST_UNKNOWN:
_net("Rx Received ICMP Unknown Host");
- err = EHOSTUNREACH;
break;
default:
_net("Rx Received ICMP DestUnreach code=%u",
@@ -222,7 +211,7 @@ void rxrpc_UDP_error_handler(struct work_struct *work)
if (call->state != RXRPC_CALL_COMPLETE &&
call->state < RXRPC_CALL_NETWORK_ERROR) {
call->state = RXRPC_CALL_NETWORK_ERROR;
- set_bit(RXRPC_CALL_RCVD_ERROR, &call->events);
+ set_bit(RXRPC_CALL_EV_RCVD_ERROR, &call->events);
rxrpc_queue_call(call);
}
write_unlock(&call->state_lock);
diff --git a/net/rxrpc/ar-input.c b/net/rxrpc/ar-input.c
index 4505a691d88c..63ed75c40e29 100644
--- a/net/rxrpc/ar-input.c
+++ b/net/rxrpc/ar-input.c
@@ -231,7 +231,7 @@ static int rxrpc_fast_process_data(struct rxrpc_call *call,
_debug("drain rx oos now");
read_lock(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_DRAIN_RX_OOS, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_DRAIN_RX_OOS, &call->events))
rxrpc_queue_call(call);
read_unlock(&call->state_lock);
}
@@ -287,12 +287,12 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
call->acks_latest = serial;
_debug("implicit ACKALL %%%u", call->acks_latest);
- set_bit(RXRPC_CALL_RCVD_ACKALL, &call->events);
+ set_bit(RXRPC_CALL_EV_RCVD_ACKALL, &call->events);
write_unlock_bh(&call->state_lock);
if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
- clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
- clear_bit(RXRPC_CALL_RESEND, &call->events);
+ clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_EV_RESEND, &call->events);
clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
}
break;
@@ -310,8 +310,8 @@ static void rxrpc_assume_implicit_ackall(struct rxrpc_call *call, u32 serial)
void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
{
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- __be32 _abort_code;
- u32 serial, hi_serial, seq, abort_code;
+ __be32 wtmp;
+ u32 hi_serial, abort_code;
_enter("%p,%p", call, skb);
@@ -330,16 +330,15 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
/* track the latest serial number on this connection for ACK packet
* information */
- serial = ntohl(sp->hdr.serial);
hi_serial = atomic_read(&call->conn->hi_serial);
- while (serial > hi_serial)
+ while (sp->hdr.serial > hi_serial)
hi_serial = atomic_cmpxchg(&call->conn->hi_serial, hi_serial,
- serial);
+ sp->hdr.serial);
/* request ACK generation for any ACK or DATA packet that requests
* it */
if (sp->hdr.flags & RXRPC_REQUEST_ACK) {
- _proto("ACK Requested on %%%u", serial);
+ _proto("ACK Requested on %%%u", sp->hdr.serial);
rxrpc_propose_ACK(call, RXRPC_ACK_REQUESTED, sp->hdr.serial, false);
}
@@ -347,24 +346,23 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
case RXRPC_PACKET_TYPE_ABORT:
_debug("abort");
- if (skb_copy_bits(skb, 0, &_abort_code,
- sizeof(_abort_code)) < 0)
+ if (skb_copy_bits(skb, 0, &wtmp, sizeof(wtmp)) < 0)
goto protocol_error;
- abort_code = ntohl(_abort_code);
- _proto("Rx ABORT %%%u { %x }", serial, abort_code);
+ abort_code = ntohl(wtmp);
+ _proto("Rx ABORT %%%u { %x }", sp->hdr.serial, abort_code);
write_lock_bh(&call->state_lock);
if (call->state < RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_REMOTELY_ABORTED;
call->abort_code = abort_code;
- set_bit(RXRPC_CALL_RCVD_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_RCVD_ABORT, &call->events);
rxrpc_queue_call(call);
}
goto free_packet_unlock;
case RXRPC_PACKET_TYPE_BUSY:
- _proto("Rx BUSY %%%u", serial);
+ _proto("Rx BUSY %%%u", sp->hdr.serial);
if (call->conn->out_clientflag)
goto protocol_error;
@@ -373,7 +371,7 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
switch (call->state) {
case RXRPC_CALL_CLIENT_SEND_REQUEST:
call->state = RXRPC_CALL_SERVER_BUSY;
- set_bit(RXRPC_CALL_RCVD_BUSY, &call->events);
+ set_bit(RXRPC_CALL_EV_RCVD_BUSY, &call->events);
rxrpc_queue_call(call);
case RXRPC_CALL_SERVER_BUSY:
goto free_packet_unlock;
@@ -382,15 +380,13 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
}
default:
- _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], serial);
+ _proto("Rx %s %%%u", rxrpc_pkts[sp->hdr.type], sp->hdr.serial);
goto protocol_error;
case RXRPC_PACKET_TYPE_DATA:
- seq = ntohl(sp->hdr.seq);
+ _proto("Rx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
- _proto("Rx DATA %%%u { #%u }", serial, seq);
-
- if (seq == 0)
+ if (sp->hdr.seq == 0)
goto protocol_error;
call->ackr_prev_seq = sp->hdr.seq;
@@ -398,9 +394,9 @@ void rxrpc_fast_process_packet(struct rxrpc_call *call, struct sk_buff *skb)
/* received data implicitly ACKs all of the request packets we
* sent when we're acting as a client */
if (call->state == RXRPC_CALL_CLIENT_AWAIT_REPLY)
- rxrpc_assume_implicit_ackall(call, serial);
+ rxrpc_assume_implicit_ackall(call, sp->hdr.serial);
- switch (rxrpc_fast_process_data(call, skb, seq)) {
+ switch (rxrpc_fast_process_data(call, skb, sp->hdr.seq)) {
case 0:
skb = NULL;
goto done;
@@ -433,7 +429,7 @@ protocol_error_locked:
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_LOCALLY_ABORTED;
call->abort_code = RX_PROTOCOL_ERROR;
- set_bit(RXRPC_CALL_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
rxrpc_queue_call(call);
}
free_packet_unlock:
@@ -481,12 +477,12 @@ static void rxrpc_process_jumbo_packet(struct rxrpc_call *call,
if (!pskb_pull(jumbo, sizeof(jhdr)))
BUG();
- sp->hdr.seq = htonl(ntohl(sp->hdr.seq) + 1);
- sp->hdr.serial = htonl(ntohl(sp->hdr.serial) + 1);
+ sp->hdr.seq += 1;
+ sp->hdr.serial += 1;
sp->hdr.flags = jhdr.flags;
sp->hdr._rsvd = jhdr._rsvd;
- _proto("Rx DATA Jumbo %%%u", ntohl(sp->hdr.serial) - 1);
+ _proto("Rx DATA Jumbo %%%u", sp->hdr.serial - 1);
rxrpc_fast_process_packet(call, part);
part = NULL;
@@ -505,7 +501,7 @@ protocol_error:
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_LOCALLY_ABORTED;
call->abort_code = RX_PROTOCOL_ERROR;
- set_bit(RXRPC_CALL_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
rxrpc_queue_call(call);
}
write_unlock_bh(&call->state_lock);
@@ -530,7 +526,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
read_lock(&call->state_lock);
switch (call->state) {
case RXRPC_CALL_LOCALLY_ABORTED:
- if (!test_and_set_bit(RXRPC_CALL_ABORT, &call->events)) {
+ if (!test_and_set_bit(RXRPC_CALL_EV_ABORT, &call->events)) {
rxrpc_queue_call(call);
goto free_unlock;
}
@@ -546,7 +542,7 @@ static void rxrpc_post_packet_to_call(struct rxrpc_call *call,
/* resend last packet of a completed call */
_debug("final ack again");
rxrpc_get_call(call);
- set_bit(RXRPC_CALL_ACK_FINAL, &call->events);
+ set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
rxrpc_queue_call(call);
goto free_unlock;
default:
@@ -607,6 +603,35 @@ static void rxrpc_post_packet_to_local(struct rxrpc_local *local,
rxrpc_queue_work(&local->event_processor);
}
+/*
+ * Extract the wire header from a packet and translate the byte order.
+ */
+static noinline
+int rxrpc_extract_header(struct rxrpc_skb_priv *sp, struct sk_buff *skb)
+{
+ struct rxrpc_wire_header whdr;
+
+ /* dig out the RxRPC connection details */
+ if (skb_copy_bits(skb, sizeof(struct udphdr), &whdr, sizeof(whdr)) < 0)
+ return -EBADMSG;
+ if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(whdr)))
+ BUG();
+
+ memset(sp, 0, sizeof(*sp));
+ sp->hdr.epoch = ntohl(whdr.epoch);
+ sp->hdr.cid = ntohl(whdr.cid);
+ sp->hdr.callNumber = ntohl(whdr.callNumber);
+ sp->hdr.seq = ntohl(whdr.seq);
+ sp->hdr.serial = ntohl(whdr.serial);
+ sp->hdr.flags = whdr.flags;
+ sp->hdr.type = whdr.type;
+ sp->hdr.userStatus = whdr.userStatus;
+ sp->hdr.securityIndex = whdr.securityIndex;
+ sp->hdr._rsvd = ntohs(whdr._rsvd);
+ sp->hdr.serviceId = ntohs(whdr.serviceId);
+ return 0;
+}
+
static struct rxrpc_connection *rxrpc_conn_from_local(struct rxrpc_local *local,
struct sk_buff *skb,
struct rxrpc_skb_priv *sp)
@@ -686,29 +711,25 @@ void rxrpc_data_ready(struct sock *sk)
UDP_INC_STATS_BH(&init_net, UDP_MIB_INDATAGRAMS, 0);
- /* the socket buffer we have is owned by UDP, with UDP's data all over
- * it, but we really want our own */
+ /* The socket buffer we have is owned by UDP, with UDP's data all over
+ * it, but we really want our own data there.
+ */
skb_orphan(skb);
sp = rxrpc_skb(skb);
- memset(sp, 0, sizeof(*sp));
_net("Rx UDP packet from %08x:%04hu",
ntohl(ip_hdr(skb)->saddr), ntohs(udp_hdr(skb)->source));
/* dig out the RxRPC connection details */
- if (skb_copy_bits(skb, sizeof(struct udphdr), &sp->hdr,
- sizeof(sp->hdr)) < 0)
+ if (rxrpc_extract_header(sp, skb) < 0)
goto bad_message;
- if (!pskb_pull(skb, sizeof(struct udphdr) + sizeof(sp->hdr)))
- BUG();
_net("Rx RxRPC %s ep=%x call=%x:%x",
sp->hdr.flags & RXRPC_CLIENT_INITIATED ? "ToServer" : "ToClient",
- ntohl(sp->hdr.epoch),
- ntohl(sp->hdr.cid),
- ntohl(sp->hdr.callNumber));
+ sp->hdr.epoch, sp->hdr.cid, sp->hdr.callNumber);
- if (sp->hdr.type == 0 || sp->hdr.type >= RXRPC_N_PACKET_TYPES) {
+ if (sp->hdr.type >= RXRPC_N_PACKET_TYPES ||
+ !((RXRPC_SUPPORTED_PACKET_TYPES >> sp->hdr.type) & 1)) {
_proto("Rx Bad Packet Type %u", sp->hdr.type);
goto bad_message;
}
@@ -737,14 +758,9 @@ void rxrpc_data_ready(struct sock *sk)
rxrpc_put_connection(conn);
} else {
struct rxrpc_call *call;
- u8 in_clientflag = 0;
-
- if (sp->hdr.flags & RXRPC_CLIENT_INITIATED)
- in_clientflag = RXRPC_CLIENT_INITIATED;
- call = rxrpc_find_call_hash(in_clientflag, sp->hdr.cid,
- sp->hdr.callNumber, sp->hdr.epoch,
- sp->hdr.serviceId, local, AF_INET,
- (u8 *)&ip_hdr(skb)->saddr);
+
+ call = rxrpc_find_call_hash(&sp->hdr, local,
+ AF_INET, &ip_hdr(skb)->saddr);
if (call)
rxrpc_post_packet_to_call(call, skb);
else
@@ -759,7 +775,7 @@ cant_route_call:
_debug("can't route call");
if (sp->hdr.flags & RXRPC_CLIENT_INITIATED &&
sp->hdr.type == RXRPC_PACKET_TYPE_DATA) {
- if (sp->hdr.seq == cpu_to_be32(1)) {
+ if (sp->hdr.seq == 1) {
_debug("first packet");
skb_queue_tail(&local->accept_queue, skb);
rxrpc_queue_work(&local->acceptor);
diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
index 2934a73a5981..cd6cdbe87125 100644
--- a/net/rxrpc/ar-internal.h
+++ b/net/rxrpc/ar-internal.h
@@ -16,7 +16,7 @@
BUG_ON(atomic_read((X)) >> (sizeof(atomic_t) - 2) == \
(POISON_FREE << 8 | POISON_FREE))
#else
-#define CHECK_SLAB_OKAY(X) do {} while(0)
+#define CHECK_SLAB_OKAY(X) do {} while (0)
#endif
#define FCRYPT_BSIZE 8
@@ -70,12 +70,31 @@ struct rxrpc_sock {
#define RXRPC_SECURITY_MAX RXRPC_SECURITY_ENCRYPT
struct sockaddr_rxrpc srx; /* local address */
sa_family_t proto; /* protocol created with */
- __be16 service_id; /* service ID of local/remote service */
};
#define rxrpc_sk(__sk) container_of((__sk), struct rxrpc_sock, sk)
/*
+ * CPU-byteorder normalised Rx packet header.
+ */
+struct rxrpc_host_header {
+ u32 epoch; /* client boot timestamp */
+ u32 cid; /* connection and channel ID */
+ u32 callNumber; /* call ID (0 for connection-level packets) */
+ u32 seq; /* sequence number of pkt in call stream */
+ u32 serial; /* serial number of pkt sent to network */
+ u8 type; /* packet type */
+ u8 flags; /* packet flags */
+ u8 userStatus; /* app-layer defined status */
+ u8 securityIndex; /* security protocol ID */
+ union {
+ u16 _rsvd; /* reserved */
+ u16 cksum; /* kerberos security checksum */
+ };
+ u16 serviceId; /* service ID */
+} __packed;
+
+/*
* RxRPC socket buffer private variables
* - max 48 bytes (struct sk_buff::cb)
*/
@@ -89,7 +108,7 @@ struct rxrpc_skb_priv {
bool need_resend; /* T if needs resending */
};
- struct rxrpc_header hdr; /* RxRPC packet header from this packet */
+ struct rxrpc_host_header hdr; /* RxRPC packet header from this packet */
};
#define rxrpc_skb(__skb) ((struct rxrpc_skb_priv *) &(__skb)->cb)
@@ -230,7 +249,7 @@ struct rxrpc_conn_bundle {
atomic_t usage;
int debug_id; /* debug ID for printks */
unsigned short num_conns; /* number of connections in this bundle */
- __be16 service_id; /* service ID */
+ u16 service_id; /* Service ID for this bundle */
u8 security_ix; /* security type */
};
@@ -252,7 +271,7 @@ struct rxrpc_connection {
struct rxrpc_security *security; /* applied security module */
struct key *key; /* security for this connection (client) */
struct key *server_key; /* security for this service */
- struct crypto_blkcipher *cipher; /* encryption handle */
+ struct crypto_skcipher *cipher; /* encryption handle */
struct rxrpc_crypt csum_iv; /* packet checksum base */
unsigned long events;
#define RXRPC_CONN_CHALLENGE 0 /* send challenge packet */
@@ -260,7 +279,6 @@ struct rxrpc_connection {
rwlock_t lock; /* access lock */
spinlock_t state_lock; /* state-change lock */
atomic_t usage;
- u32 real_conn_id; /* connection ID (host-endian) */
enum { /* current state of connection */
RXRPC_CONN_UNUSED, /* - connection not yet attempted */
RXRPC_CONN_CLIENT, /* - client connection */
@@ -282,17 +300,76 @@ struct rxrpc_connection {
u8 security_size; /* security header size */
u32 security_level; /* security level negotiated */
u32 security_nonce; /* response re-use preventer */
-
- /* the following are all in net order */
- __be32 epoch; /* epoch of this connection */
- __be32 cid; /* connection ID */
- __be16 service_id; /* service ID */
+ u32 epoch; /* epoch of this connection */
+ u32 cid; /* connection ID */
+ u16 service_id; /* service ID for this connection */
u8 security_ix; /* security type */
u8 in_clientflag; /* RXRPC_CLIENT_INITIATED if we are server */
u8 out_clientflag; /* RXRPC_CLIENT_INITIATED if we are client */
};
/*
+ * Flags in call->flags.
+ */
+enum rxrpc_call_flag {
+ RXRPC_CALL_RELEASED, /* call has been released - no more message to userspace */
+ RXRPC_CALL_TERMINAL_MSG, /* call has given the socket its final message */
+ RXRPC_CALL_RCVD_LAST, /* all packets received */
+ RXRPC_CALL_RUN_RTIMER, /* Tx resend timer started */
+ RXRPC_CALL_TX_SOFT_ACK, /* sent some soft ACKs */
+ RXRPC_CALL_PROC_BUSY, /* the processor is busy */
+ RXRPC_CALL_INIT_ACCEPT, /* acceptance was initiated */
+ RXRPC_CALL_HAS_USERID, /* has a user ID attached */
+ RXRPC_CALL_EXPECT_OOS, /* expect out of sequence packets */
+};
+
+/*
+ * Events that can be raised on a call.
+ */
+enum rxrpc_call_event {
+ RXRPC_CALL_EV_RCVD_ACKALL, /* ACKALL or reply received */
+ RXRPC_CALL_EV_RCVD_BUSY, /* busy packet received */
+ RXRPC_CALL_EV_RCVD_ABORT, /* abort packet received */
+ RXRPC_CALL_EV_RCVD_ERROR, /* network error received */
+ RXRPC_CALL_EV_ACK_FINAL, /* need to generate final ACK (and release call) */
+ RXRPC_CALL_EV_ACK, /* need to generate ACK */
+ RXRPC_CALL_EV_REJECT_BUSY, /* need to generate busy message */
+ RXRPC_CALL_EV_ABORT, /* need to generate abort */
+ RXRPC_CALL_EV_CONN_ABORT, /* local connection abort generated */
+ RXRPC_CALL_EV_RESEND_TIMER, /* Tx resend timer expired */
+ RXRPC_CALL_EV_RESEND, /* Tx resend required */
+ RXRPC_CALL_EV_DRAIN_RX_OOS, /* drain the Rx out of sequence queue */
+ RXRPC_CALL_EV_LIFE_TIMER, /* call's lifetimer ran out */
+ RXRPC_CALL_EV_ACCEPTED, /* incoming call accepted by userspace app */
+ RXRPC_CALL_EV_SECURED, /* incoming call's connection is now secure */
+ RXRPC_CALL_EV_POST_ACCEPT, /* need to post an "accept?" message to the app */
+ RXRPC_CALL_EV_RELEASE, /* need to release the call's resources */
+};
+
+/*
+ * The states that a call can be in.
+ */
+enum rxrpc_call_state {
+ RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
+ RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
+ RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
+ RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */
+ RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */
+ RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */
+ RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */
+ RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */
+ RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */
+ RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */
+ RXRPC_CALL_COMPLETE, /* - call completed */
+ RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */
+ RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */
+ RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */
+ RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */
+ RXRPC_CALL_DEAD, /* - call is dead */
+ NR__RXRPC_CALL_STATES
+};
+
+/*
* RxRPC call definition
* - matched by { connection, call_id }
*/
@@ -317,57 +394,13 @@ struct rxrpc_call {
unsigned long user_call_ID; /* user-defined call ID */
unsigned long creation_jif; /* time of call creation */
unsigned long flags;
-#define RXRPC_CALL_RELEASED 0 /* call has been released - no more message to userspace */
-#define RXRPC_CALL_TERMINAL_MSG 1 /* call has given the socket its final message */
-#define RXRPC_CALL_RCVD_LAST 2 /* all packets received */
-#define RXRPC_CALL_RUN_RTIMER 3 /* Tx resend timer started */
-#define RXRPC_CALL_TX_SOFT_ACK 4 /* sent some soft ACKs */
-#define RXRPC_CALL_PROC_BUSY 5 /* the processor is busy */
-#define RXRPC_CALL_INIT_ACCEPT 6 /* acceptance was initiated */
-#define RXRPC_CALL_HAS_USERID 7 /* has a user ID attached */
-#define RXRPC_CALL_EXPECT_OOS 8 /* expect out of sequence packets */
unsigned long events;
-#define RXRPC_CALL_RCVD_ACKALL 0 /* ACKALL or reply received */
-#define RXRPC_CALL_RCVD_BUSY 1 /* busy packet received */
-#define RXRPC_CALL_RCVD_ABORT 2 /* abort packet received */
-#define RXRPC_CALL_RCVD_ERROR 3 /* network error received */
-#define RXRPC_CALL_ACK_FINAL 4 /* need to generate final ACK (and release call) */
-#define RXRPC_CALL_ACK 5 /* need to generate ACK */
-#define RXRPC_CALL_REJECT_BUSY 6 /* need to generate busy message */
-#define RXRPC_CALL_ABORT 7 /* need to generate abort */
-#define RXRPC_CALL_CONN_ABORT 8 /* local connection abort generated */
-#define RXRPC_CALL_RESEND_TIMER 9 /* Tx resend timer expired */
-#define RXRPC_CALL_RESEND 10 /* Tx resend required */
-#define RXRPC_CALL_DRAIN_RX_OOS 11 /* drain the Rx out of sequence queue */
-#define RXRPC_CALL_LIFE_TIMER 12 /* call's lifetimer ran out */
-#define RXRPC_CALL_ACCEPTED 13 /* incoming call accepted by userspace app */
-#define RXRPC_CALL_SECURED 14 /* incoming call's connection is now secure */
-#define RXRPC_CALL_POST_ACCEPT 15 /* need to post an "accept?" message to the app */
-#define RXRPC_CALL_RELEASE 16 /* need to release the call's resources */
-
spinlock_t lock;
rwlock_t state_lock; /* lock for state transition */
atomic_t usage;
atomic_t sequence; /* Tx data packet sequence counter */
u32 abort_code; /* local/remote abort code */
- enum { /* current state of call */
- RXRPC_CALL_CLIENT_SEND_REQUEST, /* - client sending request phase */
- RXRPC_CALL_CLIENT_AWAIT_REPLY, /* - client awaiting reply */
- RXRPC_CALL_CLIENT_RECV_REPLY, /* - client receiving reply phase */
- RXRPC_CALL_CLIENT_FINAL_ACK, /* - client sending final ACK phase */
- RXRPC_CALL_SERVER_SECURING, /* - server securing request connection */
- RXRPC_CALL_SERVER_ACCEPTING, /* - server accepting request */
- RXRPC_CALL_SERVER_RECV_REQUEST, /* - server receiving request */
- RXRPC_CALL_SERVER_ACK_REQUEST, /* - server pending ACK of request */
- RXRPC_CALL_SERVER_SEND_REPLY, /* - server sending reply */
- RXRPC_CALL_SERVER_AWAIT_ACK, /* - server awaiting final ACK */
- RXRPC_CALL_COMPLETE, /* - call completed */
- RXRPC_CALL_SERVER_BUSY, /* - call rejected by busy server */
- RXRPC_CALL_REMOTELY_ABORTED, /* - call aborted by peer */
- RXRPC_CALL_LOCALLY_ABORTED, /* - call aborted locally on error or close */
- RXRPC_CALL_NETWORK_ERROR, /* - call terminated by network error */
- RXRPC_CALL_DEAD, /* - call is dead */
- } state;
+ enum rxrpc_call_state state : 8; /* current state of call */
int debug_id; /* debug ID for printks */
u8 channel; /* connection channel occupied by this call */
@@ -389,9 +422,9 @@ struct rxrpc_call {
rxrpc_seq_t rx_data_eaten; /* last data seq ID consumed by recvmsg */
rxrpc_seq_t rx_first_oos; /* first packet in rx_oos_queue (or 0) */
rxrpc_seq_t ackr_win_top; /* top of ACK window (rx_data_eaten is bottom) */
- rxrpc_seq_net_t ackr_prev_seq; /* previous sequence number received */
+ rxrpc_seq_t ackr_prev_seq; /* previous sequence number received */
u8 ackr_reason; /* reason to ACK */
- __be32 ackr_serial; /* serial of packet being ACK'd */
+ rxrpc_serial_t ackr_serial; /* serial of packet being ACK'd */
atomic_t ackr_not_idle; /* number of packets in Rx queue */
/* received packet records, 1 bit per record */
@@ -403,11 +436,10 @@ struct rxrpc_call {
u8 in_clientflag; /* Copy of conn->in_clientflag for hashing */
struct rxrpc_local *local; /* Local endpoint. Used for hashing. */
sa_family_t proto; /* Frame protocol */
- /* the following should all be in net order */
- __be32 cid; /* connection ID + channel index */
- __be32 call_id; /* call ID on connection */
- __be32 epoch; /* epoch of this connection */
- __be16 service_id; /* service ID */
+ u32 call_id; /* call ID on connection */
+ u32 cid; /* connection ID plus channel index */
+ u32 epoch; /* epoch of this connection */
+ u16 service_id; /* service ID */
union { /* Peer IP address for hashing */
__be32 ipv4_addr;
__u8 ipv6_addr[16]; /* Anticipates eventual IPv6 support */
@@ -423,7 +455,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
if (call->state < RXRPC_CALL_COMPLETE) {
call->abort_code = abort_code;
call->state = RXRPC_CALL_LOCALLY_ABORTED;
- set_bit(RXRPC_CALL_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
}
write_unlock_bh(&call->state_lock);
}
@@ -432,7 +464,7 @@ static inline void rxrpc_abort_call(struct rxrpc_call *call, u32 abort_code)
* af_rxrpc.c
*/
extern atomic_t rxrpc_n_skbs;
-extern __be32 rxrpc_epoch;
+extern u32 rxrpc_epoch;
extern atomic_t rxrpc_debug_id;
extern struct workqueue_struct *rxrpc_workqueue;
@@ -446,35 +478,35 @@ int rxrpc_reject_call(struct rxrpc_sock *);
/*
* ar-ack.c
*/
-extern unsigned rxrpc_requested_ack_delay;
-extern unsigned rxrpc_soft_ack_delay;
-extern unsigned rxrpc_idle_ack_delay;
-extern unsigned rxrpc_rx_window_size;
-extern unsigned rxrpc_rx_mtu;
-extern unsigned rxrpc_rx_jumbo_max;
+extern unsigned int rxrpc_requested_ack_delay;
+extern unsigned int rxrpc_soft_ack_delay;
+extern unsigned int rxrpc_idle_ack_delay;
+extern unsigned int rxrpc_rx_window_size;
+extern unsigned int rxrpc_rx_mtu;
+extern unsigned int rxrpc_rx_jumbo_max;
-void __rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
-void rxrpc_propose_ACK(struct rxrpc_call *, u8, __be32, bool);
+void __rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
+void rxrpc_propose_ACK(struct rxrpc_call *, u8, u32, bool);
void rxrpc_process_call(struct work_struct *);
/*
* ar-call.c
*/
-extern unsigned rxrpc_max_call_lifetime;
-extern unsigned rxrpc_dead_call_expiry;
+extern unsigned int rxrpc_max_call_lifetime;
+extern unsigned int rxrpc_dead_call_expiry;
extern struct kmem_cache *rxrpc_call_jar;
extern struct list_head rxrpc_calls;
extern rwlock_t rxrpc_call_lock;
-struct rxrpc_call *rxrpc_find_call_hash(u8, __be32, __be32, __be32,
- __be16, void *, sa_family_t, const u8 *);
+struct rxrpc_call *rxrpc_find_call_hash(struct rxrpc_host_header *,
+ void *, sa_family_t, const void *);
struct rxrpc_call *rxrpc_get_client_call(struct rxrpc_sock *,
struct rxrpc_transport *,
struct rxrpc_conn_bundle *,
unsigned long, int, gfp_t);
struct rxrpc_call *rxrpc_incoming_call(struct rxrpc_sock *,
struct rxrpc_connection *,
- struct rxrpc_header *, gfp_t);
+ struct rxrpc_host_header *, gfp_t);
struct rxrpc_call *rxrpc_find_server_call(struct rxrpc_sock *, unsigned long);
void rxrpc_release_call(struct rxrpc_call *);
void rxrpc_release_calls_on_socket(struct rxrpc_sock *);
@@ -484,22 +516,22 @@ void __exit rxrpc_destroy_all_calls(void);
/*
* ar-connection.c
*/
-extern unsigned rxrpc_connection_expiry;
+extern unsigned int rxrpc_connection_expiry;
extern struct list_head rxrpc_connections;
extern rwlock_t rxrpc_connection_lock;
struct rxrpc_conn_bundle *rxrpc_get_bundle(struct rxrpc_sock *,
struct rxrpc_transport *,
- struct key *, __be16, gfp_t);
+ struct key *, u16, gfp_t);
void rxrpc_put_bundle(struct rxrpc_transport *, struct rxrpc_conn_bundle *);
int rxrpc_connect_call(struct rxrpc_sock *, struct rxrpc_transport *,
struct rxrpc_conn_bundle *, struct rxrpc_call *, gfp_t);
void rxrpc_put_connection(struct rxrpc_connection *);
void __exit rxrpc_destroy_all_connections(void);
struct rxrpc_connection *rxrpc_find_connection(struct rxrpc_transport *,
- struct rxrpc_header *);
+ struct rxrpc_host_header *);
extern struct rxrpc_connection *
-rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_header *,
+rxrpc_incoming_connection(struct rxrpc_transport *, struct rxrpc_host_header *,
gfp_t);
/*
@@ -547,7 +579,7 @@ int rxrpc_get_server_data_key(struct rxrpc_connection *, const void *, time_t,
/*
* ar-output.c
*/
-extern unsigned rxrpc_resend_timeout;
+extern unsigned int rxrpc_resend_timeout;
int rxrpc_send_packet(struct rxrpc_transport *, struct sk_buff *);
int rxrpc_client_sendmsg(struct rxrpc_sock *, struct rxrpc_transport *,
@@ -595,7 +627,7 @@ void rxrpc_packet_destructor(struct sk_buff *);
/*
* ar-transport.c
*/
-extern unsigned rxrpc_transport_expiry;
+extern unsigned int rxrpc_transport_expiry;
struct rxrpc_transport *rxrpc_get_transport(struct rxrpc_local *,
struct rxrpc_peer *, gfp_t);
@@ -694,7 +726,7 @@ do { \
printk(KERN_ERR "RxRPC: Assertion failed\n"); \
BUG(); \
} \
-} while(0)
+} while (0)
#define ASSERTCMP(X, OP, Y) \
do { \
@@ -707,7 +739,7 @@ do { \
(unsigned long)(X), (unsigned long)(Y)); \
BUG(); \
} \
-} while(0)
+} while (0)
#define ASSERTIF(C, X) \
do { \
@@ -716,7 +748,7 @@ do { \
printk(KERN_ERR "RxRPC: Assertion failed\n"); \
BUG(); \
} \
-} while(0)
+} while (0)
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
@@ -729,25 +761,25 @@ do { \
(unsigned long)(X), (unsigned long)(Y)); \
BUG(); \
} \
-} while(0)
+} while (0)
#else
#define ASSERT(X) \
do { \
-} while(0)
+} while (0)
#define ASSERTCMP(X, OP, Y) \
do { \
-} while(0)
+} while (0)
#define ASSERTIF(C, X) \
do { \
-} while(0)
+} while (0)
#define ASSERTIFCMP(C, X, OP, Y) \
do { \
-} while(0)
+} while (0)
#endif /* __KDEBUGALL */
@@ -804,9 +836,9 @@ do { \
CHECK_SLAB_OKAY(&(CALL)->usage); \
if (atomic_inc_return(&(CALL)->usage) == 1) \
BUG(); \
-} while(0)
+} while (0)
#define rxrpc_put_call(CALL) \
do { \
__rxrpc_put_call(CALL); \
-} while(0)
+} while (0)
diff --git a/net/rxrpc/ar-key.c b/net/rxrpc/ar-key.c
index da3cc09f683e..3fb492eedeb9 100644
--- a/net/rxrpc/ar-key.c
+++ b/net/rxrpc/ar-key.c
@@ -12,11 +12,11 @@
* "afs@CAMBRIDGE.REDHAT.COM>
*/
+#include <crypto/skcipher.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/key-type.h>
-#include <linux/crypto.h>
#include <linux/ctype.h>
#include <linux/slab.h>
#include <net/sock.h>
@@ -824,7 +824,7 @@ static void rxrpc_free_preparse(struct key_preparsed_payload *prep)
*/
static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
{
- struct crypto_blkcipher *ci;
+ struct crypto_skcipher *ci;
_enter("%zu", prep->datalen);
@@ -833,13 +833,13 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
memcpy(&prep->payload.data[2], prep->data, 8);
- ci = crypto_alloc_blkcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
+ ci = crypto_alloc_skcipher("pcbc(des)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(ci)) {
_leave(" = %ld", PTR_ERR(ci));
return PTR_ERR(ci);
}
- if (crypto_blkcipher_setkey(ci, prep->data, 8) < 0)
+ if (crypto_skcipher_setkey(ci, prep->data, 8) < 0)
BUG();
prep->payload.data[0] = ci;
@@ -853,7 +853,7 @@ static int rxrpc_preparse_s(struct key_preparsed_payload *prep)
static void rxrpc_free_preparse_s(struct key_preparsed_payload *prep)
{
if (prep->payload.data[0])
- crypto_free_blkcipher(prep->payload.data[0]);
+ crypto_free_skcipher(prep->payload.data[0]);
}
/*
@@ -870,7 +870,7 @@ static void rxrpc_destroy(struct key *key)
static void rxrpc_destroy_s(struct key *key)
{
if (key->payload.data[0]) {
- crypto_free_blkcipher(key->payload.data[0]);
+ crypto_free_skcipher(key->payload.data[0]);
key->payload.data[0] = NULL;
}
}
@@ -896,15 +896,9 @@ int rxrpc_request_key(struct rxrpc_sock *rx, char __user *optval, int optlen)
if (optlen <= 0 || optlen > PAGE_SIZE - 1)
return -EINVAL;
- description = kmalloc(optlen + 1, GFP_KERNEL);
- if (!description)
- return -ENOMEM;
-
- if (copy_from_user(description, optval, optlen)) {
- kfree(description);
- return -EFAULT;
- }
- description[optlen] = 0;
+ description = memdup_user_nul(optval, optlen);
+ if (IS_ERR(description))
+ return PTR_ERR(description);
key = request_key(&key_type_rxrpc, description, NULL);
if (IS_ERR(key)) {
@@ -933,15 +927,9 @@ int rxrpc_server_keyring(struct rxrpc_sock *rx, char __user *optval,
if (optlen <= 0 || optlen > PAGE_SIZE - 1)
return -EINVAL;
- description = kmalloc(optlen + 1, GFP_KERNEL);
- if (!description)
- return -ENOMEM;
-
- if (copy_from_user(description, optval, optlen)) {
- kfree(description);
- return -EFAULT;
- }
- description[optlen] = 0;
+ description = memdup_user_nul(optval, optlen);
+ if (IS_ERR(description))
+ return PTR_ERR(description);
key = request_key(&key_type_keyring, description, NULL);
if (IS_ERR(key)) {
diff --git a/net/rxrpc/ar-local.c b/net/rxrpc/ar-local.c
index 78483b4602bf..4e1e6db0050b 100644
--- a/net/rxrpc/ar-local.c
+++ b/net/rxrpc/ar-local.c
@@ -323,9 +323,11 @@ void __exit rxrpc_destroy_all_locals(void)
* Reply to a version request
*/
static void rxrpc_send_version_request(struct rxrpc_local *local,
- struct rxrpc_header *hdr,
+ struct rxrpc_host_header *hdr,
struct sk_buff *skb)
{
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct sockaddr_in sin;
struct msghdr msg;
struct kvec iov[2];
@@ -344,15 +346,20 @@ static void rxrpc_send_version_request(struct rxrpc_local *local,
msg.msg_controllen = 0;
msg.msg_flags = 0;
- hdr->seq = 0;
- hdr->serial = 0;
- hdr->type = RXRPC_PACKET_TYPE_VERSION;
- hdr->flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
- hdr->userStatus = 0;
- hdr->_rsvd = 0;
-
- iov[0].iov_base = hdr;
- iov[0].iov_len = sizeof(*hdr);
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.seq = 0;
+ whdr.serial = 0;
+ whdr.type = RXRPC_PACKET_TYPE_VERSION;
+ whdr.flags = RXRPC_LAST_PACKET | (~hdr->flags & RXRPC_CLIENT_INITIATED);
+ whdr.userStatus = 0;
+ whdr.securityIndex = 0;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(sp->hdr.serviceId);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
iov[1].iov_base = (char *)rxrpc_version_string;
iov[1].iov_len = sizeof(rxrpc_version_string);
@@ -383,7 +390,7 @@ static void rxrpc_process_local_events(struct work_struct *work)
while ((skb = skb_dequeue(&local->event_queue))) {
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
- kdebug("{%d},{%u}", local->debug_id, sp->hdr.type);
+ _debug("{%d},{%u}", local->debug_id, sp->hdr.type);
switch (sp->hdr.type) {
case RXRPC_PACKET_TYPE_VERSION:
diff --git a/net/rxrpc/ar-output.c b/net/rxrpc/ar-output.c
index a40d3afe93b7..d36fb6e1a29c 100644
--- a/net/rxrpc/ar-output.c
+++ b/net/rxrpc/ar-output.c
@@ -21,7 +21,7 @@
/*
* Time till packet resend (in jiffies).
*/
-unsigned rxrpc_resend_timeout = 4 * HZ;
+unsigned int rxrpc_resend_timeout = 4 * HZ;
static int rxrpc_send_data(struct rxrpc_sock *rx,
struct rxrpc_call *call,
@@ -111,11 +111,11 @@ static void rxrpc_send_abort(struct rxrpc_call *call, u32 abort_code)
if (call->state <= RXRPC_CALL_COMPLETE) {
call->state = RXRPC_CALL_LOCALLY_ABORTED;
call->abort_code = abort_code;
- set_bit(RXRPC_CALL_ABORT, &call->events);
+ set_bit(RXRPC_CALL_EV_ABORT, &call->events);
del_timer_sync(&call->resend_timer);
del_timer_sync(&call->ack_timer);
- clear_bit(RXRPC_CALL_RESEND_TIMER, &call->events);
- clear_bit(RXRPC_CALL_ACK, &call->events);
+ clear_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events);
+ clear_bit(RXRPC_CALL_EV_ACK, &call->events);
clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
rxrpc_queue_call(call);
}
@@ -136,7 +136,7 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans,
struct rxrpc_call *call;
unsigned long user_call_ID = 0;
struct key *key;
- __be16 service_id;
+ u16 service_id;
u32 abort_code = 0;
int ret;
@@ -151,11 +151,11 @@ int rxrpc_client_sendmsg(struct rxrpc_sock *rx, struct rxrpc_transport *trans,
bundle = NULL;
if (trans) {
- service_id = rx->service_id;
+ service_id = rx->srx.srx_service;
if (msg->msg_name) {
DECLARE_SOCKADDR(struct sockaddr_rxrpc *, srx,
msg->msg_name);
- service_id = htons(srx->srx_service);
+ service_id = srx->srx_service;
}
key = rx->key;
if (key && !rx->key->payload.data[0])
@@ -348,7 +348,7 @@ int rxrpc_send_packet(struct rxrpc_transport *trans, struct sk_buff *skb)
/* send the packet with the don't fragment bit set if we currently
* think it's small enough */
- if (skb->len - sizeof(struct rxrpc_header) < trans->peer->maxdata) {
+ if (skb->len - sizeof(struct rxrpc_wire_header) < trans->peer->maxdata) {
down_read(&trans->local->defrag_sem);
/* send the packet by UDP
* - returns -EMSGSIZE if UDP would have to fragment the packet
@@ -401,7 +401,8 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
int ret;
_enter(",{%d},%ld",
- CIRC_SPACE(call->acks_head, call->acks_tail, call->acks_winsz),
+ CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
+ call->acks_winsz),
*timeo);
add_wait_queue(&call->tx_waitq, &myself);
@@ -409,7 +410,7 @@ static int rxrpc_wait_for_tx_window(struct rxrpc_sock *rx,
for (;;) {
set_current_state(TASK_INTERRUPTIBLE);
ret = 0;
- if (CIRC_SPACE(call->acks_head, call->acks_tail,
+ if (CIRC_SPACE(call->acks_head, ACCESS_ONCE(call->acks_tail),
call->acks_winsz) > 0)
break;
if (signal_pending(current)) {
@@ -437,7 +438,7 @@ static inline void rxrpc_instant_resend(struct rxrpc_call *call)
if (try_to_del_timer_sync(&call->resend_timer) >= 0) {
clear_bit(RXRPC_CALL_RUN_RTIMER, &call->flags);
if (call->state < RXRPC_CALL_COMPLETE &&
- !test_and_set_bit(RXRPC_CALL_RESEND_TIMER, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_RESEND_TIMER, &call->events))
rxrpc_queue_call(call);
}
read_unlock_bh(&call->state_lock);
@@ -480,8 +481,7 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
write_unlock_bh(&call->state_lock);
}
- _proto("Tx DATA %%%u { #%u }",
- ntohl(sp->hdr.serial), ntohl(sp->hdr.seq));
+ _proto("Tx DATA %%%u { #%u }", sp->hdr.serial, sp->hdr.seq);
sp->need_resend = false;
sp->resend_at = jiffies + rxrpc_resend_timeout;
@@ -513,6 +513,29 @@ static void rxrpc_queue_packet(struct rxrpc_call *call, struct sk_buff *skb,
}
/*
+ * Convert a host-endian header into a network-endian header.
+ */
+static void rxrpc_insert_header(struct sk_buff *skb)
+{
+ struct rxrpc_wire_header whdr;
+ struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
+
+ whdr.epoch = htonl(sp->hdr.epoch);
+ whdr.cid = htonl(sp->hdr.cid);
+ whdr.callNumber = htonl(sp->hdr.callNumber);
+ whdr.seq = htonl(sp->hdr.seq);
+ whdr.serial = htonl(sp->hdr.serial);
+ whdr.type = sp->hdr.type;
+ whdr.flags = sp->hdr.flags;
+ whdr.userStatus = sp->hdr.userStatus;
+ whdr.securityIndex = sp->hdr.securityIndex;
+ whdr._rsvd = htons(sp->hdr._rsvd);
+ whdr.serviceId = htons(sp->hdr.serviceId);
+
+ memcpy(skb->head, &whdr, sizeof(whdr));
+}
+
+/*
* send data through a socket
* - must be called in process context
* - caller holds the socket locked
@@ -531,7 +554,7 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
/* this should be in poll */
- clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
return -EPIPE;
@@ -548,7 +571,8 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
_debug("alloc");
- if (CIRC_SPACE(call->acks_head, call->acks_tail,
+ if (CIRC_SPACE(call->acks_head,
+ ACCESS_ONCE(call->acks_tail),
call->acks_winsz) <= 0) {
ret = -EAGAIN;
if (msg->msg_flags & MSG_DONTWAIT)
@@ -650,22 +674,22 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
seq = atomic_inc_return(&call->sequence);
- sp->hdr.epoch = conn->epoch;
- sp->hdr.cid = call->cid;
+ sp->hdr.epoch = conn->epoch;
+ sp->hdr.cid = call->cid;
sp->hdr.callNumber = call->call_id;
- sp->hdr.seq = htonl(seq);
- sp->hdr.serial =
- htonl(atomic_inc_return(&conn->serial));
- sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
+ sp->hdr.seq = seq;
+ sp->hdr.serial = atomic_inc_return(&conn->serial);
+ sp->hdr.type = RXRPC_PACKET_TYPE_DATA;
sp->hdr.userStatus = 0;
sp->hdr.securityIndex = conn->security_ix;
- sp->hdr._rsvd = 0;
- sp->hdr.serviceId = conn->service_id;
+ sp->hdr._rsvd = 0;
+ sp->hdr.serviceId = call->service_id;
sp->hdr.flags = conn->out_clientflag;
if (msg_data_left(msg) == 0 && !more)
sp->hdr.flags |= RXRPC_LAST_PACKET;
- else if (CIRC_SPACE(call->acks_head, call->acks_tail,
+ else if (CIRC_SPACE(call->acks_head,
+ ACCESS_ONCE(call->acks_tail),
call->acks_winsz) > 1)
sp->hdr.flags |= RXRPC_MORE_PACKETS;
if (more && seq & 1)
@@ -673,12 +697,11 @@ static int rxrpc_send_data(struct rxrpc_sock *rx,
ret = rxrpc_secure_packet(
call, skb, skb->mark,
- skb->head + sizeof(struct rxrpc_header));
+ skb->head + sizeof(struct rxrpc_wire_header));
if (ret < 0)
goto out;
- memcpy(skb->head, &sp->hdr,
- sizeof(struct rxrpc_header));
+ rxrpc_insert_header(skb);
rxrpc_queue_packet(call, skb, !msg_data_left(msg) && !more);
skb = NULL;
}
diff --git a/net/rxrpc/ar-peer.c b/net/rxrpc/ar-peer.c
index bebaa43484bc..dc089b1976aa 100644
--- a/net/rxrpc/ar-peer.c
+++ b/net/rxrpc/ar-peer.c
@@ -92,7 +92,7 @@ static struct rxrpc_peer *rxrpc_alloc_peer(struct sockaddr_rxrpc *srx,
BUG();
}
- peer->hdrsize += sizeof(struct rxrpc_header);
+ peer->hdrsize += sizeof(struct rxrpc_wire_header);
peer->maxdata = peer->mtu - peer->hdrsize;
}
diff --git a/net/rxrpc/ar-proc.c b/net/rxrpc/ar-proc.c
index 38047f713f2c..525b2ba5a8f4 100644
--- a/net/rxrpc/ar-proc.c
+++ b/net/rxrpc/ar-proc.c
@@ -74,9 +74,9 @@ static int rxrpc_call_seq_show(struct seq_file *seq, void *v)
" %-8.8s %08x %lx\n",
lbuff,
rbuff,
- ntohs(call->conn->service_id),
- ntohl(call->conn->cid),
- ntohl(call->call_id),
+ call->conn->service_id,
+ call->cid,
+ call->call_id,
call->conn->in_clientflag ? "Svc" : "Clt",
atomic_read(&call->usage),
rxrpc_call_states[call->state],
@@ -157,8 +157,8 @@ static int rxrpc_connection_seq_show(struct seq_file *seq, void *v)
" %s %08x %08x %08x\n",
lbuff,
rbuff,
- ntohs(conn->service_id),
- ntohl(conn->cid),
+ conn->service_id,
+ conn->cid,
conn->call_counter,
conn->in_clientflag ? "Svc" : "Clt",
atomic_read(&conn->usage),
diff --git a/net/rxrpc/ar-recvmsg.c b/net/rxrpc/ar-recvmsg.c
index b92beded7459..64facba24a45 100644
--- a/net/rxrpc/ar-recvmsg.c
+++ b/net/rxrpc/ar-recvmsg.c
@@ -33,7 +33,7 @@ void rxrpc_remove_user_ID(struct rxrpc_sock *rx, struct rxrpc_call *call)
read_lock_bh(&call->state_lock);
if (!test_bit(RXRPC_CALL_RELEASED, &call->flags) &&
- !test_and_set_bit(RXRPC_CALL_RELEASE, &call->events))
+ !test_and_set_bit(RXRPC_CALL_EV_RELEASE, &call->events))
rxrpc_queue_call(call);
read_unlock_bh(&call->state_lock);
}
@@ -158,7 +158,7 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
goto receive_non_data_message;
_debug("recvmsg DATA #%u { %d, %d }",
- ntohl(sp->hdr.seq), skb->len, sp->offset);
+ sp->hdr.seq, skb->len, sp->offset);
if (!continue_call) {
/* only set the control data once per recvmsg() */
@@ -169,11 +169,11 @@ int rxrpc_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
ASSERT(test_bit(RXRPC_CALL_HAS_USERID, &call->flags));
}
- ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
- ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
- call->rx_data_recv = ntohl(sp->hdr.seq);
+ ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+ ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+ call->rx_data_recv = sp->hdr.seq;
- ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+ ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
offset = sp->offset;
copy = skb->len - offset;
@@ -364,11 +364,11 @@ void rxrpc_kernel_data_delivered(struct sk_buff *skb)
struct rxrpc_skb_priv *sp = rxrpc_skb(skb);
struct rxrpc_call *call = sp->call;
- ASSERTCMP(ntohl(sp->hdr.seq), >=, call->rx_data_recv);
- ASSERTCMP(ntohl(sp->hdr.seq), <=, call->rx_data_recv + 1);
- call->rx_data_recv = ntohl(sp->hdr.seq);
+ ASSERTCMP(sp->hdr.seq, >=, call->rx_data_recv);
+ ASSERTCMP(sp->hdr.seq, <=, call->rx_data_recv + 1);
+ call->rx_data_recv = sp->hdr.seq;
- ASSERTCMP(ntohl(sp->hdr.seq), >, call->rx_data_eaten);
+ ASSERTCMP(sp->hdr.seq, >, call->rx_data_eaten);
rxrpc_free_skb(skb);
}
diff --git a/net/rxrpc/ar-security.c b/net/rxrpc/ar-security.c
index 8334474eb26c..ceff6394a65f 100644
--- a/net/rxrpc/ar-security.c
+++ b/net/rxrpc/ar-security.c
@@ -167,11 +167,11 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
struct rxrpc_sock *rx;
struct key *key;
key_ref_t kref;
- char kdesc[5+1+3+1];
+ char kdesc[5 + 1 + 3 + 1];
_enter("");
- sprintf(kdesc, "%u:%u", ntohs(conn->service_id), conn->security_ix);
+ sprintf(kdesc, "%u:%u", conn->service_id, conn->security_ix);
sec = rxrpc_security_lookup(conn->security_ix);
if (!sec) {
@@ -182,7 +182,7 @@ int rxrpc_init_server_conn_security(struct rxrpc_connection *conn)
/* find the service */
read_lock_bh(&local->services_lock);
list_for_each_entry(rx, &local->services, listen_link) {
- if (rx->service_id == conn->service_id)
+ if (rx->srx.srx_service == conn->service_id)
goto found_service;
}
diff --git a/net/rxrpc/ar-skbuff.c b/net/rxrpc/ar-skbuff.c
index 4cfab49e329d..62a267472fce 100644
--- a/net/rxrpc/ar-skbuff.c
+++ b/net/rxrpc/ar-skbuff.c
@@ -34,7 +34,7 @@ static void rxrpc_request_final_ACK(struct rxrpc_call *call)
/* get an extra ref on the call for the final-ACK generator to
* release */
rxrpc_get_call(call);
- set_bit(RXRPC_CALL_ACK_FINAL, &call->events);
+ set_bit(RXRPC_CALL_EV_ACK_FINAL, &call->events);
if (try_to_del_timer_sync(&call->ack_timer) >= 0)
rxrpc_queue_call(call);
break;
@@ -59,7 +59,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
spin_lock_bh(&call->lock);
- _debug("hard ACK #%u", ntohl(sp->hdr.seq));
+ _debug("hard ACK #%u", sp->hdr.seq);
for (loop = 0; loop < RXRPC_ACKR_WINDOW_ASZ; loop++) {
call->ackr_window[loop] >>= 1;
@@ -67,7 +67,7 @@ static void rxrpc_hard_ACK_data(struct rxrpc_call *call,
call->ackr_window[loop + 1] << (BITS_PER_LONG - 1);
}
- seq = ntohl(sp->hdr.seq);
+ seq = sp->hdr.seq;
ASSERTCMP(seq, ==, call->rx_data_eaten + 1);
call->rx_data_eaten = seq;
@@ -133,5 +133,4 @@ void rxrpc_kernel_free_skb(struct sk_buff *skb)
{
rxrpc_free_skb(skb);
}
-
EXPORT_SYMBOL(rxrpc_kernel_free_skb);
diff --git a/net/rxrpc/ar-transport.c b/net/rxrpc/ar-transport.c
index 9946467f16b4..66a1a5676446 100644
--- a/net/rxrpc/ar-transport.c
+++ b/net/rxrpc/ar-transport.c
@@ -20,7 +20,7 @@
/*
* Time after last use at which transport record is cleaned up.
*/
-unsigned rxrpc_transport_expiry = 3600 * 24;
+unsigned int rxrpc_transport_expiry = 3600 * 24;
static void rxrpc_transport_reaper(struct work_struct *work);
@@ -51,6 +51,7 @@ static struct rxrpc_transport *rxrpc_alloc_transport(struct rxrpc_local *local,
spin_lock_init(&trans->client_lock);
rwlock_init(&trans->conn_lock);
atomic_set(&trans->usage, 1);
+ trans->conn_idcounter = peer->srx.srx_service << 16;
trans->debug_id = atomic_inc_return(&rxrpc_debug_id);
if (peer->srx.transport.family == AF_INET) {
diff --git a/net/rxrpc/rxkad.c b/net/rxrpc/rxkad.c
index d7a9ab5a9d9c..f0aeb8163688 100644
--- a/net/rxrpc/rxkad.c
+++ b/net/rxrpc/rxkad.c
@@ -9,11 +9,11 @@
* 2 of the License, or (at your option) any later version.
*/
+#include <crypto/skcipher.h>
#include <linux/module.h>
#include <linux/net.h>
#include <linux/skbuff.h>
#include <linux/udp.h>
-#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/ctype.h>
#include <linux/slab.h>
@@ -53,7 +53,7 @@ MODULE_LICENSE("GPL");
* alloc routine, but since we have it to hand, we use it to decrypt RESPONSE
* packets
*/
-static struct crypto_blkcipher *rxkad_ci;
+static struct crypto_skcipher *rxkad_ci;
static DEFINE_MUTEX(rxkad_ci_mutex);
/*
@@ -61,7 +61,7 @@ static DEFINE_MUTEX(rxkad_ci_mutex);
*/
static int rxkad_init_connection_security(struct rxrpc_connection *conn)
{
- struct crypto_blkcipher *ci;
+ struct crypto_skcipher *ci;
struct rxrpc_key_token *token;
int ret;
@@ -70,15 +70,15 @@ static int rxkad_init_connection_security(struct rxrpc_connection *conn)
token = conn->key->payload.data[0];
conn->security_ix = token->security_index;
- ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+ ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(ci)) {
_debug("no cipher");
ret = PTR_ERR(ci);
goto error;
}
- if (crypto_blkcipher_setkey(ci, token->kad->session_key,
- sizeof(token->kad->session_key)) < 0)
+ if (crypto_skcipher_setkey(ci, token->kad->session_key,
+ sizeof(token->kad->session_key)) < 0)
BUG();
switch (conn->security_level) {
@@ -113,7 +113,7 @@ error:
static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
{
struct rxrpc_key_token *token;
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
struct scatterlist sg[2];
struct rxrpc_crypt iv;
struct {
@@ -128,21 +128,23 @@ static void rxkad_prime_packet_security(struct rxrpc_connection *conn)
token = conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- desc.tfm = conn->cipher;
- desc.info = iv.x;
- desc.flags = 0;
-
- tmpbuf.x[0] = conn->epoch;
- tmpbuf.x[1] = conn->cid;
+ tmpbuf.x[0] = htonl(conn->epoch);
+ tmpbuf.x[1] = htonl(conn->cid);
tmpbuf.x[2] = 0;
tmpbuf.x[3] = htonl(conn->security_ix);
sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
- crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+ skcipher_request_set_tfm(req, conn->cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
+
+ crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
memcpy(&conn->csum_iv, &tmpbuf.x[2], sizeof(conn->csum_iv));
- ASSERTCMP(conn->csum_iv.n[0], ==, tmpbuf.x[2]);
+ ASSERTCMP((u32 __force)conn->csum_iv.n[0], ==, (u32 __force)tmpbuf.x[2]);
_leave("");
}
@@ -156,7 +158,7 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
void *sechdr)
{
struct rxrpc_skb_priv *sp;
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[2];
struct {
@@ -169,21 +171,24 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
_enter("");
- check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
- data_size |= (u32) check << 16;
+ check = sp->hdr.seq ^ sp->hdr.callNumber;
+ data_size |= (u32)check << 16;
tmpbuf.hdr.data_size = htonl(data_size);
memcpy(&tmpbuf.first, sechdr + 4, sizeof(tmpbuf.first));
/* start the encryption afresh */
memset(&iv, 0, sizeof(iv));
- desc.tfm = call->conn->cipher;
- desc.info = iv.x;
- desc.flags = 0;
sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
- crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+ skcipher_request_set_tfm(req, call->conn->cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
+
+ crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
memcpy(sechdr, &tmpbuf, sizeof(tmpbuf));
@@ -195,81 +200,91 @@ static int rxkad_secure_packet_auth(const struct rxrpc_call *call,
* wholly encrypt a packet (level 2 security)
*/
static int rxkad_secure_packet_encrypt(const struct rxrpc_call *call,
- struct sk_buff *skb,
- u32 data_size,
- void *sechdr)
+ struct sk_buff *skb,
+ u32 data_size,
+ void *sechdr)
{
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr rxkhdr
__attribute__((aligned(8))); /* must be all on one page */
struct rxrpc_skb_priv *sp;
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
struct sk_buff *trailer;
unsigned int len;
u16 check;
int nsg;
+ int err;
sp = rxrpc_skb(skb);
_enter("");
- check = ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+ check = sp->hdr.seq ^ sp->hdr.callNumber;
- rxkhdr.data_size = htonl(data_size | (u32) check << 16);
+ rxkhdr.data_size = htonl(data_size | (u32)check << 16);
rxkhdr.checksum = 0;
/* encrypt from the session key */
token = call->conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- desc.tfm = call->conn->cipher;
- desc.info = iv.x;
- desc.flags = 0;
sg_init_one(&sg[0], sechdr, sizeof(rxkhdr));
sg_init_one(&sg[1], &rxkhdr, sizeof(rxkhdr));
- crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(rxkhdr));
+
+ skcipher_request_set_tfm(req, call->conn->cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(rxkhdr), iv.x);
+
+ crypto_skcipher_encrypt(req);
/* we want to encrypt the skbuff in-place */
nsg = skb_cow_data(skb, 0, &trailer);
+ err = -ENOMEM;
if (nsg < 0 || nsg > 16)
- return -ENOMEM;
+ goto out;
len = data_size + call->conn->size_align - 1;
len &= ~(call->conn->size_align - 1);
sg_init_table(sg, nsg);
skb_to_sgvec(skb, sg, 0, len);
- crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+
+ skcipher_request_set_crypt(req, sg, sg, len, iv.x);
+
+ crypto_skcipher_encrypt(req);
_leave(" = 0");
- return 0;
+ err = 0;
+
+out:
+ skcipher_request_zero(req);
+ return err;
}
/*
* checksum an RxRPC packet header
*/
static int rxkad_secure_packet(const struct rxrpc_call *call,
- struct sk_buff *skb,
- size_t data_size,
- void *sechdr)
+ struct sk_buff *skb,
+ size_t data_size,
+ void *sechdr)
{
struct rxrpc_skb_priv *sp;
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[2];
struct {
__be32 x[2];
} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
- __be32 x;
- u32 y;
+ u32 x, y;
int ret;
sp = rxrpc_skb(skb);
_enter("{%d{%x}},{#%u},%zu,",
- call->debug_id, key_serial(call->conn->key), ntohl(sp->hdr.seq),
+ call->debug_id, key_serial(call->conn->key), sp->hdr.seq,
data_size);
if (!call->conn->cipher)
@@ -281,25 +296,28 @@ static int rxkad_secure_packet(const struct rxrpc_call *call,
/* continue encrypting from where we left off */
memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
- desc.tfm = call->conn->cipher;
- desc.info = iv.x;
- desc.flags = 0;
/* calculate the security checksum */
- x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
- x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
- tmpbuf.x[0] = sp->hdr.callNumber;
- tmpbuf.x[1] = x;
+ x = call->channel << (32 - RXRPC_CIDSHIFT);
+ x |= sp->hdr.seq & 0x3fffffff;
+ tmpbuf.x[0] = htonl(sp->hdr.callNumber);
+ tmpbuf.x[1] = htonl(x);
sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
- crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+ skcipher_request_set_tfm(req, call->conn->cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
+
+ crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
y = ntohl(tmpbuf.x[1]);
y = (y >> 16) & 0xffff;
if (y == 0)
y = 1; /* zero checksums are not permitted */
- sp->hdr.cksum = htons(y);
+ sp->hdr.cksum = y;
switch (call->conn->security_level) {
case RXRPC_SECURITY_PLAIN:
@@ -330,7 +348,7 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
{
struct rxkad_level1_hdr sechdr;
struct rxrpc_skb_priv *sp;
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[16];
struct sk_buff *trailer;
@@ -352,11 +370,13 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
/* start the decryption afresh */
memset(&iv, 0, sizeof(iv));
- desc.tfm = call->conn->cipher;
- desc.info = iv.x;
- desc.flags = 0;
- crypto_blkcipher_decrypt_iv(&desc, sg, sg, 8);
+ skcipher_request_set_tfm(req, call->conn->cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg, sg, 8, iv.x);
+
+ crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
/* remove the decrypted packet length */
if (skb_copy_bits(skb, 0, &sechdr, sizeof(sechdr)) < 0)
@@ -368,7 +388,7 @@ static int rxkad_verify_packet_auth(const struct rxrpc_call *call,
data_size = buf & 0xffff;
check = buf >> 16;
- check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+ check ^= sp->hdr.seq ^ sp->hdr.callNumber;
check &= 0xffff;
if (check != 0) {
*_abort_code = RXKADSEALEDINCON;
@@ -405,7 +425,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
const struct rxrpc_key_token *token;
struct rxkad_level2_hdr sechdr;
struct rxrpc_skb_priv *sp;
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist _sg[4], *sg;
struct sk_buff *trailer;
@@ -435,11 +455,13 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
/* decrypt from the session key */
token = call->conn->key->payload.data[0];
memcpy(&iv, token->kad->session_key, sizeof(iv));
- desc.tfm = call->conn->cipher;
- desc.info = iv.x;
- desc.flags = 0;
- crypto_blkcipher_decrypt_iv(&desc, sg, sg, skb->len);
+ skcipher_request_set_tfm(req, call->conn->cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg, sg, skb->len, iv.x);
+
+ crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
if (sg != _sg)
kfree(sg);
@@ -453,7 +475,7 @@ static int rxkad_verify_packet_encrypt(const struct rxrpc_call *call,
data_size = buf & 0xffff;
check = buf >> 16;
- check ^= ntohl(sp->hdr.seq ^ sp->hdr.callNumber);
+ check ^= sp->hdr.seq ^ sp->hdr.callNumber;
check &= 0xffff;
if (check != 0) {
*_abort_code = RXKADSEALEDINCON;
@@ -487,23 +509,21 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
struct sk_buff *skb,
u32 *_abort_code)
{
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, call->conn->cipher);
struct rxrpc_skb_priv *sp;
struct rxrpc_crypt iv;
struct scatterlist sg[2];
struct {
__be32 x[2];
} tmpbuf __attribute__((aligned(8))); /* must all be in same page */
- __be32 x;
- __be16 cksum;
- u32 y;
+ u16 cksum;
+ u32 x, y;
int ret;
sp = rxrpc_skb(skb);
_enter("{%d{%x}},{#%u}",
- call->debug_id, key_serial(call->conn->key),
- ntohl(sp->hdr.seq));
+ call->debug_id, key_serial(call->conn->key), sp->hdr.seq);
if (!call->conn->cipher)
return 0;
@@ -516,26 +536,28 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
/* continue encrypting from where we left off */
memcpy(&iv, call->conn->csum_iv.x, sizeof(iv));
- desc.tfm = call->conn->cipher;
- desc.info = iv.x;
- desc.flags = 0;
/* validate the security checksum */
- x = htonl(call->channel << (32 - RXRPC_CIDSHIFT));
- x |= sp->hdr.seq & cpu_to_be32(0x3fffffff);
- tmpbuf.x[0] = call->call_id;
- tmpbuf.x[1] = x;
+ x = call->channel << (32 - RXRPC_CIDSHIFT);
+ x |= sp->hdr.seq & 0x3fffffff;
+ tmpbuf.x[0] = htonl(call->call_id);
+ tmpbuf.x[1] = htonl(x);
sg_init_one(&sg[0], &tmpbuf, sizeof(tmpbuf));
sg_init_one(&sg[1], &tmpbuf, sizeof(tmpbuf));
- crypto_blkcipher_encrypt_iv(&desc, &sg[0], &sg[1], sizeof(tmpbuf));
+
+ skcipher_request_set_tfm(req, call->conn->cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg[1], &sg[0], sizeof(tmpbuf), iv.x);
+
+ crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
y = ntohl(tmpbuf.x[1]);
- y = (y >> 16) & 0xffff;
- if (y == 0)
- y = 1; /* zero checksums are not permitted */
+ cksum = (y >> 16) & 0xffff;
+ if (cksum == 0)
+ cksum = 1; /* zero checksums are not permitted */
- cksum = htons(y);
if (sp->hdr.cksum != cksum) {
*_abort_code = RXKADSEALEDINCON;
_leave(" = -EPROTO [csum failed]");
@@ -567,10 +589,11 @@ static int rxkad_verify_packet(const struct rxrpc_call *call,
static int rxkad_issue_challenge(struct rxrpc_connection *conn)
{
struct rxkad_challenge challenge;
- struct rxrpc_header hdr;
+ struct rxrpc_wire_header whdr;
struct msghdr msg;
struct kvec iov[2];
size_t len;
+ u32 serial;
int ret;
_enter("{%d,%x}", conn->debug_id, key_serial(conn->key));
@@ -592,26 +615,27 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
msg.msg_controllen = 0;
msg.msg_flags = 0;
- hdr.epoch = conn->epoch;
- hdr.cid = conn->cid;
- hdr.callNumber = 0;
- hdr.seq = 0;
- hdr.type = RXRPC_PACKET_TYPE_CHALLENGE;
- hdr.flags = conn->out_clientflag;
- hdr.userStatus = 0;
- hdr.securityIndex = conn->security_ix;
- hdr._rsvd = 0;
- hdr.serviceId = conn->service_id;
-
- iov[0].iov_base = &hdr;
- iov[0].iov_len = sizeof(hdr);
+ whdr.epoch = htonl(conn->epoch);
+ whdr.cid = htonl(conn->cid);
+ whdr.callNumber = 0;
+ whdr.seq = 0;
+ whdr.type = RXRPC_PACKET_TYPE_CHALLENGE;
+ whdr.flags = conn->out_clientflag;
+ whdr.userStatus = 0;
+ whdr.securityIndex = conn->security_ix;
+ whdr._rsvd = 0;
+ whdr.serviceId = htons(conn->service_id);
+
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
iov[1].iov_base = &challenge;
iov[1].iov_len = sizeof(challenge);
len = iov[0].iov_len + iov[1].iov_len;
- hdr.serial = htonl(atomic_inc_return(&conn->serial));
- _proto("Tx CHALLENGE %%%u", ntohl(hdr.serial));
+ serial = atomic_inc_return(&conn->serial);
+ whdr.serial = htonl(serial);
+ _proto("Tx CHALLENGE %%%u", serial);
ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 2, len);
if (ret < 0) {
@@ -627,13 +651,15 @@ static int rxkad_issue_challenge(struct rxrpc_connection *conn)
* send a Kerberos security response
*/
static int rxkad_send_response(struct rxrpc_connection *conn,
- struct rxrpc_header *hdr,
+ struct rxrpc_host_header *hdr,
struct rxkad_response *resp,
const struct rxkad_key *s2)
{
+ struct rxrpc_wire_header whdr;
struct msghdr msg;
struct kvec iov[3];
size_t len;
+ u32 serial;
int ret;
_enter("");
@@ -644,24 +670,26 @@ static int rxkad_send_response(struct rxrpc_connection *conn,
msg.msg_controllen = 0;
msg.msg_flags = 0;
- hdr->epoch = conn->epoch;
- hdr->seq = 0;
- hdr->type = RXRPC_PACKET_TYPE_RESPONSE;
- hdr->flags = conn->out_clientflag;
- hdr->userStatus = 0;
- hdr->_rsvd = 0;
+ memset(&whdr, 0, sizeof(whdr));
+ whdr.epoch = htonl(hdr->epoch);
+ whdr.cid = htonl(hdr->cid);
+ whdr.type = RXRPC_PACKET_TYPE_RESPONSE;
+ whdr.flags = conn->out_clientflag;
+ whdr.securityIndex = hdr->securityIndex;
+ whdr.serviceId = htons(hdr->serviceId);
- iov[0].iov_base = hdr;
- iov[0].iov_len = sizeof(*hdr);
+ iov[0].iov_base = &whdr;
+ iov[0].iov_len = sizeof(whdr);
iov[1].iov_base = resp;
iov[1].iov_len = sizeof(*resp);
- iov[2].iov_base = (void *) s2->ticket;
+ iov[2].iov_base = (void *)s2->ticket;
iov[2].iov_len = s2->ticket_len;
len = iov[0].iov_len + iov[1].iov_len + iov[2].iov_len;
- hdr->serial = htonl(atomic_inc_return(&conn->serial));
- _proto("Tx RESPONSE %%%u", ntohl(hdr->serial));
+ serial = atomic_inc_return(&conn->serial);
+ whdr.serial = htonl(serial);
+ _proto("Tx RESPONSE %%%u", serial);
ret = kernel_sendmsg(conn->trans->local->socket, &msg, iov, 3, len);
if (ret < 0) {
@@ -718,18 +746,21 @@ static void rxkad_encrypt_response(struct rxrpc_connection *conn,
struct rxkad_response *resp,
const struct rxkad_key *s2)
{
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, conn->cipher);
struct rxrpc_crypt iv;
struct scatterlist sg[2];
/* continue encrypting from where we left off */
memcpy(&iv, s2->session_key, sizeof(iv));
- desc.tfm = conn->cipher;
- desc.info = iv.x;
- desc.flags = 0;
rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
- crypto_blkcipher_encrypt_iv(&desc, sg, sg, sizeof(resp->encrypted));
+
+ skcipher_request_set_tfm(req, conn->cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
+
+ crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
}
/*
@@ -770,7 +801,7 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
min_level = ntohl(challenge.min_level);
_proto("Rx CHALLENGE %%%u { v=%u n=%u ml=%u }",
- ntohl(sp->hdr.serial), version, nonce, min_level);
+ sp->hdr.serial, version, nonce, min_level);
abort_code = RXKADINCONSISTENCY;
if (version != RXKAD_VERSION)
@@ -785,22 +816,23 @@ static int rxkad_respond_to_challenge(struct rxrpc_connection *conn,
/* build the response packet */
memset(&resp, 0, sizeof(resp));
- resp.version = RXKAD_VERSION;
- resp.encrypted.epoch = conn->epoch;
- resp.encrypted.cid = conn->cid;
- resp.encrypted.securityIndex = htonl(conn->security_ix);
+ resp.version = htonl(RXKAD_VERSION);
+ resp.encrypted.epoch = htonl(conn->epoch);
+ resp.encrypted.cid = htonl(conn->cid);
+ resp.encrypted.securityIndex = htonl(conn->security_ix);
+ resp.encrypted.inc_nonce = htonl(nonce + 1);
+ resp.encrypted.level = htonl(conn->security_level);
+ resp.kvno = htonl(token->kad->kvno);
+ resp.ticket_len = htonl(token->kad->ticket_len);
+
resp.encrypted.call_id[0] =
- (conn->channels[0] ? conn->channels[0]->call_id : 0);
+ htonl(conn->channels[0] ? conn->channels[0]->call_id : 0);
resp.encrypted.call_id[1] =
- (conn->channels[1] ? conn->channels[1]->call_id : 0);
+ htonl(conn->channels[1] ? conn->channels[1]->call_id : 0);
resp.encrypted.call_id[2] =
- (conn->channels[2] ? conn->channels[2]->call_id : 0);
+ htonl(conn->channels[2] ? conn->channels[2]->call_id : 0);
resp.encrypted.call_id[3] =
- (conn->channels[3] ? conn->channels[3]->call_id : 0);
- resp.encrypted.inc_nonce = htonl(nonce + 1);
- resp.encrypted.level = htonl(conn->security_level);
- resp.kvno = htonl(token->kad->kvno);
- resp.ticket_len = htonl(token->kad->ticket_len);
+ htonl(conn->channels[3] ? conn->channels[3]->call_id : 0);
/* calculate the response checksum and then do the encryption */
rxkad_calc_response_checksum(&resp);
@@ -822,7 +854,7 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
time_t *_expiry,
u32 *_abort_code)
{
- struct blkcipher_desc desc;
+ struct skcipher_request *req;
struct rxrpc_crypt iv, key;
struct scatterlist sg[1];
struct in_addr addr;
@@ -853,12 +885,21 @@ static int rxkad_decrypt_ticket(struct rxrpc_connection *conn,
memcpy(&iv, &conn->server_key->payload.data[2], sizeof(iv));
- desc.tfm = conn->server_key->payload.data[0];
- desc.info = iv.x;
- desc.flags = 0;
+ req = skcipher_request_alloc(conn->server_key->payload.data[0],
+ GFP_NOFS);
+ if (!req) {
+ *_abort_code = RXKADNOAUTH;
+ ret = -ENOMEM;
+ goto error;
+ }
sg_init_one(&sg[0], ticket, ticket_len);
- crypto_blkcipher_decrypt_iv(&desc, sg, sg, ticket_len);
+
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg, sg, ticket_len, iv.x);
+
+ crypto_skcipher_decrypt(req);
+ skcipher_request_free(req);
p = ticket;
end = p + ticket_len;
@@ -966,7 +1007,7 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
struct rxkad_response *resp,
const struct rxrpc_crypt *session_key)
{
- struct blkcipher_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, rxkad_ci);
struct scatterlist sg[2];
struct rxrpc_crypt iv;
@@ -976,17 +1017,21 @@ static void rxkad_decrypt_response(struct rxrpc_connection *conn,
ASSERT(rxkad_ci != NULL);
mutex_lock(&rxkad_ci_mutex);
- if (crypto_blkcipher_setkey(rxkad_ci, session_key->x,
- sizeof(*session_key)) < 0)
+ if (crypto_skcipher_setkey(rxkad_ci, session_key->x,
+ sizeof(*session_key)) < 0)
BUG();
memcpy(&iv, session_key, sizeof(iv));
- desc.tfm = rxkad_ci;
- desc.info = iv.x;
- desc.flags = 0;
rxkad_sg_set_buf2(sg, &resp->encrypted, sizeof(resp->encrypted));
- crypto_blkcipher_decrypt_iv(&desc, sg, sg, sizeof(resp->encrypted));
+
+ skcipher_request_set_tfm(req, rxkad_ci);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg, sg, sizeof(resp->encrypted), iv.x);
+
+ crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
+
mutex_unlock(&rxkad_ci_mutex);
_leave("");
@@ -1022,7 +1067,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
kvno = ntohl(response.kvno);
sp = rxrpc_skb(skb);
_proto("Rx RESPONSE %%%u { v=%u kv=%u tl=%u }",
- ntohl(sp->hdr.serial), version, kvno, ticket_len);
+ sp->hdr.serial, version, kvno, ticket_len);
abort_code = RXKADINCONSISTENCY;
if (version != RXKAD_VERSION)
@@ -1058,9 +1103,9 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
rxkad_decrypt_response(conn, &response, &session_key);
abort_code = RXKADSEALEDINCON;
- if (response.encrypted.epoch != conn->epoch)
+ if (ntohl(response.encrypted.epoch) != conn->epoch)
goto protocol_error_free;
- if (response.encrypted.cid != conn->cid)
+ if (ntohl(response.encrypted.cid) != conn->cid)
goto protocol_error_free;
if (ntohl(response.encrypted.securityIndex) != conn->security_ix)
goto protocol_error_free;
@@ -1077,7 +1122,7 @@ static int rxkad_verify_response(struct rxrpc_connection *conn,
goto protocol_error_free;
abort_code = RXKADOUTOFSEQUENCE;
- if (response.encrypted.inc_nonce != htonl(conn->security_nonce + 1))
+ if (ntohl(response.encrypted.inc_nonce) != conn->security_nonce + 1)
goto protocol_error_free;
abort_code = RXKADLEVELFAIL;
@@ -1115,7 +1160,7 @@ static void rxkad_clear(struct rxrpc_connection *conn)
_enter("");
if (conn->cipher)
- crypto_free_blkcipher(conn->cipher);
+ crypto_free_skcipher(conn->cipher);
}
/*
@@ -1141,7 +1186,7 @@ static __init int rxkad_init(void)
/* pin the cipher we need so that the crypto layer doesn't invoke
* keventd to go get it */
- rxkad_ci = crypto_alloc_blkcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
+ rxkad_ci = crypto_alloc_skcipher("pcbc(fcrypt)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(rxkad_ci))
return PTR_ERR(rxkad_ci);
@@ -1155,7 +1200,7 @@ static __exit void rxkad_exit(void)
_enter("");
rxrpc_unregister_security(&rxkad);
- crypto_free_blkcipher(rxkad_ci);
+ crypto_free_skcipher(rxkad_ci);
}
module_exit(rxkad_exit);
diff --git a/net/rxrpc/sysctl.c b/net/rxrpc/sysctl.c
index 50a98a910eb1..d20ed575acf4 100644
--- a/net/rxrpc/sysctl.c
+++ b/net/rxrpc/sysctl.c
@@ -15,11 +15,11 @@
#include "ar-internal.h"
static struct ctl_table_header *rxrpc_sysctl_reg_table;
-static const unsigned zero = 0;
-static const unsigned one = 1;
-static const unsigned four = 4;
-static const unsigned n_65535 = 65535;
-static const unsigned n_max_acks = RXRPC_MAXACKS;
+static const unsigned int zero = 0;
+static const unsigned int one = 1;
+static const unsigned int four = 4;
+static const unsigned int n_65535 = 65535;
+static const unsigned int n_max_acks = RXRPC_MAXACKS;
/*
* RxRPC operating parameters.
@@ -32,7 +32,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "req_ack_delay",
.data = &rxrpc_requested_ack_delay,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
.extra1 = (void *)&zero,
@@ -40,7 +40,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "soft_ack_delay",
.data = &rxrpc_soft_ack_delay,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
.extra1 = (void *)&one,
@@ -48,7 +48,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "idle_ack_delay",
.data = &rxrpc_idle_ack_delay,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
.extra1 = (void *)&one,
@@ -56,7 +56,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "resend_timeout",
.data = &rxrpc_resend_timeout,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_ms_jiffies,
.extra1 = (void *)&one,
@@ -66,7 +66,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "max_call_lifetime",
.data = &rxrpc_max_call_lifetime,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
.extra1 = (void *)&one,
@@ -74,7 +74,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "dead_call_expiry",
.data = &rxrpc_dead_call_expiry,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_jiffies,
.extra1 = (void *)&one,
@@ -84,7 +84,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "connection_expiry",
.data = &rxrpc_connection_expiry,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)&one,
@@ -92,7 +92,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "transport_expiry",
.data = &rxrpc_transport_expiry,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)&one,
@@ -102,7 +102,7 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "rx_window_size",
.data = &rxrpc_rx_window_size,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)&one,
@@ -111,16 +111,16 @@ static struct ctl_table rxrpc_sysctl_table[] = {
{
.procname = "rx_mtu",
.data = &rxrpc_rx_mtu,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)&one,
- .extra1 = (void *)&n_65535,
+ .extra2 = (void *)&n_65535,
},
{
.procname = "rx_jumbo_max",
.data = &rxrpc_rx_jumbo_max,
- .maxlen = sizeof(unsigned),
+ .maxlen = sizeof(unsigned int),
.mode = 0644,
.proc_handler = proc_dointvec_minmax,
.extra1 = (void *)&one,
diff --git a/net/sched/Kconfig b/net/sched/Kconfig
index daa33432b716..b148302bbaf2 100644
--- a/net/sched/Kconfig
+++ b/net/sched/Kconfig
@@ -310,15 +310,21 @@ config NET_SCH_PIE
If unsure, say N.
config NET_SCH_INGRESS
- tristate "Ingress Qdisc"
+ tristate "Ingress/classifier-action Qdisc"
depends on NET_CLS_ACT
select NET_INGRESS
+ select NET_EGRESS
---help---
- Say Y here if you want to use classifiers for incoming packets.
+ Say Y here if you want to use classifiers for incoming and/or outgoing
+ packets. This qdisc doesn't do anything else besides running classifiers,
+ which can also have actions attached to them. In case of outgoing packets,
+ classifiers that this qdisc holds are executed in the transmit path
+ before real enqueuing to an egress qdisc happens.
+
If unsure, say Y.
- To compile this code as a module, choose M here: the
- module will be called sch_ingress.
+ To compile this code as a module, choose M here: the module will be
+ called sch_ingress with alias of sch_clsact.
config NET_SCH_PLUG
tristate "Plug network traffic until release (PLUG)"
@@ -733,6 +739,28 @@ config NET_ACT_CONNMARK
To compile this code as a module, choose M here: the
module will be called act_connmark.
+config NET_ACT_IFE
+ tristate "Inter-FE action based on IETF ForCES InterFE LFB"
+ depends on NET_CLS_ACT
+ ---help---
+ Say Y here to allow for sourcing and terminating metadata
+ For details refer to netdev01 paper:
+ "Distributing Linux Traffic Control Classifier-Action Subsystem"
+ Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+
+ To compile this code as a module, choose M here: the
+ module will be called act_ife.
+
+config NET_IFE_SKBMARK
+ tristate "Support to encoding decoding skb mark on IFE action"
+ depends on NET_ACT_IFE
+ ---help---
+
+config NET_IFE_SKBPRIO
+ tristate "Support to encoding decoding skb prio on IFE action"
+ depends on NET_ACT_IFE
+ ---help---
+
config NET_CLS_IND
bool "Incoming device classification"
depends on NET_CLS_U32 || NET_CLS_FW
diff --git a/net/sched/Makefile b/net/sched/Makefile
index 690c1689e090..84bddb373517 100644
--- a/net/sched/Makefile
+++ b/net/sched/Makefile
@@ -19,6 +19,9 @@ obj-$(CONFIG_NET_ACT_CSUM) += act_csum.o
obj-$(CONFIG_NET_ACT_VLAN) += act_vlan.o
obj-$(CONFIG_NET_ACT_BPF) += act_bpf.o
obj-$(CONFIG_NET_ACT_CONNMARK) += act_connmark.o
+obj-$(CONFIG_NET_ACT_IFE) += act_ife.o
+obj-$(CONFIG_NET_IFE_SKBMARK) += act_meta_mark.o
+obj-$(CONFIG_NET_IFE_SKBPRIO) += act_meta_skbprio.o
obj-$(CONFIG_NET_SCH_FIFO) += sch_fifo.o
obj-$(CONFIG_NET_SCH_CBQ) += sch_cbq.o
obj-$(CONFIG_NET_SCH_HTB) += sch_htb.o
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 06e7c4a37245..96066665e376 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -36,10 +36,9 @@ static void free_tcf(struct rcu_head *head)
kfree(p);
}
-static void tcf_hash_destroy(struct tc_action *a)
+static void tcf_hash_destroy(struct tcf_hashinfo *hinfo, struct tc_action *a)
{
struct tcf_common *p = a->priv;
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
spin_lock_bh(&hinfo->lock);
hlist_del(&p->tcfc_head);
@@ -68,8 +67,8 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
if (p->tcfc_bindcnt <= 0 && p->tcfc_refcnt <= 0) {
if (a->ops->cleanup)
a->ops->cleanup(a, bind);
- tcf_hash_destroy(a);
- ret = 1;
+ tcf_hash_destroy(a->hinfo, a);
+ ret = ACT_P_DELETED;
}
}
@@ -77,10 +76,9 @@ int __tcf_hash_release(struct tc_action *a, bool bind, bool strict)
}
EXPORT_SYMBOL(__tcf_hash_release);
-static int tcf_dump_walker(struct sk_buff *skb, struct netlink_callback *cb,
- struct tc_action *a)
+static int tcf_dump_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
+ struct netlink_callback *cb, struct tc_action *a)
{
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct hlist_head *head;
struct tcf_common *p;
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
@@ -126,9 +124,9 @@ nla_put_failure:
goto done;
}
-static int tcf_del_walker(struct sk_buff *skb, struct tc_action *a)
+static int tcf_del_walker(struct tcf_hashinfo *hinfo, struct sk_buff *skb,
+ struct tc_action *a)
{
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct hlist_head *head;
struct hlist_node *n;
struct tcf_common *p;
@@ -163,18 +161,24 @@ nla_put_failure:
return ret;
}
-static int tcf_generic_walker(struct sk_buff *skb, struct netlink_callback *cb,
- int type, struct tc_action *a)
+int tcf_generic_walker(struct tc_action_net *tn, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
{
+ struct tcf_hashinfo *hinfo = tn->hinfo;
+
+ a->hinfo = hinfo;
+
if (type == RTM_DELACTION) {
- return tcf_del_walker(skb, a);
+ return tcf_del_walker(hinfo, skb, a);
} else if (type == RTM_GETACTION) {
- return tcf_dump_walker(skb, cb, a);
+ return tcf_dump_walker(hinfo, skb, cb, a);
} else {
WARN(1, "tcf_generic_walker: unknown action %d\n", type);
return -EINVAL;
}
}
+EXPORT_SYMBOL(tcf_generic_walker);
static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
{
@@ -191,8 +195,9 @@ static struct tcf_common *tcf_hash_lookup(u32 index, struct tcf_hashinfo *hinfo)
return p;
}
-u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
+u32 tcf_hash_new_index(struct tc_action_net *tn)
{
+ struct tcf_hashinfo *hinfo = tn->hinfo;
u32 val = hinfo->index;
do {
@@ -205,28 +210,31 @@ u32 tcf_hash_new_index(struct tcf_hashinfo *hinfo)
}
EXPORT_SYMBOL(tcf_hash_new_index);
-int tcf_hash_search(struct tc_action *a, u32 index)
+int tcf_hash_search(struct tc_action_net *tn, struct tc_action *a, u32 index)
{
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct tcf_hashinfo *hinfo = tn->hinfo;
struct tcf_common *p = tcf_hash_lookup(index, hinfo);
if (p) {
a->priv = p;
+ a->hinfo = hinfo;
return 1;
}
return 0;
}
EXPORT_SYMBOL(tcf_hash_search);
-int tcf_hash_check(u32 index, struct tc_action *a, int bind)
+int tcf_hash_check(struct tc_action_net *tn, u32 index, struct tc_action *a,
+ int bind)
{
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct tcf_hashinfo *hinfo = tn->hinfo;
struct tcf_common *p = NULL;
if (index && (p = tcf_hash_lookup(index, hinfo)) != NULL) {
if (bind)
p->tcfc_bindcnt++;
p->tcfc_refcnt++;
a->priv = p;
+ a->hinfo = hinfo;
return 1;
}
return 0;
@@ -243,11 +251,11 @@ void tcf_hash_cleanup(struct tc_action *a, struct nlattr *est)
}
EXPORT_SYMBOL(tcf_hash_cleanup);
-int tcf_hash_create(u32 index, struct nlattr *est, struct tc_action *a,
- int size, int bind, bool cpustats)
+int tcf_hash_create(struct tc_action_net *tn, u32 index, struct nlattr *est,
+ struct tc_action *a, int size, int bind, bool cpustats)
{
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
struct tcf_common *p = kzalloc(size, GFP_KERNEL);
+ struct tcf_hashinfo *hinfo = tn->hinfo;
int err = -ENOMEM;
if (unlikely(!p))
@@ -272,7 +280,7 @@ err2:
}
spin_lock_init(&p->tcfc_lock);
INIT_HLIST_NODE(&p->tcfc_head);
- p->tcfc_index = index ? index : tcf_hash_new_index(hinfo);
+ p->tcfc_index = index ? index : tcf_hash_new_index(tn);
p->tcfc_tm.install = jiffies;
p->tcfc_tm.lastuse = jiffies;
if (est) {
@@ -286,14 +294,15 @@ err2:
}
a->priv = (void *) p;
+ a->hinfo = hinfo;
return 0;
}
EXPORT_SYMBOL(tcf_hash_create);
-void tcf_hash_insert(struct tc_action *a)
+void tcf_hash_insert(struct tc_action_net *tn, struct tc_action *a)
{
struct tcf_common *p = a->priv;
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct tcf_hashinfo *hinfo = tn->hinfo;
unsigned int h = tcf_hash(p->tcfc_index, hinfo->hmask);
spin_lock_bh(&hinfo->lock);
@@ -302,59 +311,78 @@ void tcf_hash_insert(struct tc_action *a)
}
EXPORT_SYMBOL(tcf_hash_insert);
+void tcf_hashinfo_destroy(const struct tc_action_ops *ops,
+ struct tcf_hashinfo *hinfo)
+{
+ struct tc_action a = {
+ .ops = ops,
+ .hinfo = hinfo,
+ };
+ int i;
+
+ for (i = 0; i < hinfo->hmask + 1; i++) {
+ struct tcf_common *p;
+ struct hlist_node *n;
+
+ hlist_for_each_entry_safe(p, n, &hinfo->htab[i], tcfc_head) {
+ int ret;
+
+ a.priv = p;
+ ret = __tcf_hash_release(&a, false, true);
+ if (ret == ACT_P_DELETED)
+ module_put(ops->owner);
+ else if (ret < 0)
+ return;
+ }
+ }
+ kfree(hinfo->htab);
+}
+EXPORT_SYMBOL(tcf_hashinfo_destroy);
+
static LIST_HEAD(act_base);
static DEFINE_RWLOCK(act_mod_lock);
-int tcf_register_action(struct tc_action_ops *act, unsigned int mask)
+int tcf_register_action(struct tc_action_ops *act,
+ struct pernet_operations *ops)
{
struct tc_action_ops *a;
- int err;
+ int ret;
- /* Must supply act, dump and init */
- if (!act->act || !act->dump || !act->init)
+ if (!act->act || !act->dump || !act->init || !act->walk || !act->lookup)
return -EINVAL;
- /* Supply defaults */
- if (!act->lookup)
- act->lookup = tcf_hash_search;
- if (!act->walk)
- act->walk = tcf_generic_walker;
-
- act->hinfo = kmalloc(sizeof(struct tcf_hashinfo), GFP_KERNEL);
- if (!act->hinfo)
- return -ENOMEM;
- err = tcf_hashinfo_init(act->hinfo, mask);
- if (err) {
- kfree(act->hinfo);
- return err;
- }
-
write_lock(&act_mod_lock);
list_for_each_entry(a, &act_base, head) {
if (act->type == a->type || (strcmp(act->kind, a->kind) == 0)) {
write_unlock(&act_mod_lock);
- tcf_hashinfo_destroy(act->hinfo);
- kfree(act->hinfo);
return -EEXIST;
}
}
list_add_tail(&act->head, &act_base);
write_unlock(&act_mod_lock);
+
+ ret = register_pernet_subsys(ops);
+ if (ret) {
+ tcf_unregister_action(act, ops);
+ return ret;
+ }
+
return 0;
}
EXPORT_SYMBOL(tcf_register_action);
-int tcf_unregister_action(struct tc_action_ops *act)
+int tcf_unregister_action(struct tc_action_ops *act,
+ struct pernet_operations *ops)
{
struct tc_action_ops *a;
int err = -ENOENT;
+ unregister_pernet_subsys(ops);
+
write_lock(&act_mod_lock);
list_for_each_entry(a, &act_base, head) {
if (a == act) {
list_del(&act->head);
- tcf_hashinfo_destroy(act->hinfo);
- kfree(act->hinfo);
err = 0;
break;
}
@@ -721,8 +749,8 @@ static struct tc_action *create_a(int i)
return act;
}
-static struct tc_action *
-tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
+static struct tc_action *tcf_action_get_1(struct net *net, struct nlattr *nla,
+ struct nlmsghdr *n, u32 portid)
{
struct nlattr *tb[TCA_ACT_MAX + 1];
struct tc_action *a;
@@ -749,7 +777,7 @@ tcf_action_get_1(struct nlattr *nla, struct nlmsghdr *n, u32 portid)
if (a->ops == NULL) /* could happen in batch of actions */
goto err_free;
err = -ENOENT;
- if (a->ops->lookup(a, index) == 0)
+ if (a->ops->lookup(net, a, index) == 0)
goto err_mod;
module_put(a->ops->owner);
@@ -819,7 +847,7 @@ static int tca_action_flush(struct net *net, struct nlattr *nla,
if (nest == NULL)
goto out_module_put;
- err = a.ops->walk(skb, &dcb, RTM_DELACTION, &a);
+ err = a.ops->walk(net, skb, &dcb, RTM_DELACTION, &a);
if (err < 0)
goto out_module_put;
if (err == 0)
@@ -897,7 +925,7 @@ tca_action_gd(struct net *net, struct nlattr *nla, struct nlmsghdr *n,
}
for (i = 1; i <= TCA_ACT_MAX_PRIO && tb[i]; i++) {
- act = tcf_action_get_1(tb[i], n, portid);
+ act = tcf_action_get_1(net, tb[i], n, portid);
if (IS_ERR(act)) {
ret = PTR_ERR(act);
goto err;
@@ -1044,6 +1072,7 @@ find_dump_kind(const struct nlmsghdr *n)
static int
tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
{
+ struct net *net = sock_net(skb->sk);
struct nlmsghdr *nlh;
unsigned char *b = skb_tail_pointer(skb);
struct nlattr *nest;
@@ -1078,7 +1107,7 @@ tc_dump_action(struct sk_buff *skb, struct netlink_callback *cb)
if (nest == NULL)
goto out_module_put;
- ret = a_o->walk(skb, cb, RTM_GETACTION, &a);
+ ret = a_o->walk(net, skb, cb, RTM_GETACTION, &a);
if (ret < 0)
goto out_module_put;
diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c
index 0bc6f912f870..8c9f1f0459ab 100644
--- a/net/sched/act_bpf.c
+++ b/net/sched/act_bpf.c
@@ -33,6 +33,8 @@ struct tcf_bpf_cfg {
bool is_ebpf;
};
+static int bpf_net_id;
+
static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
struct tcf_result *res)
{
@@ -275,6 +277,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *act,
int replace, int bind)
{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
struct nlattr *tb[TCA_ACT_BPF_MAX + 1];
struct tcf_bpf_cfg cfg, old;
struct tc_act_bpf *parm;
@@ -294,8 +297,8 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_ACT_BPF_PARMS]);
- if (!tcf_hash_check(parm->index, act, bind)) {
- ret = tcf_hash_create(parm->index, est, act,
+ if (!tcf_hash_check(tn, parm->index, act, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, act,
sizeof(*prog), bind, true);
if (ret < 0)
return ret;
@@ -344,7 +347,7 @@ static int tcf_bpf_init(struct net *net, struct nlattr *nla,
rcu_assign_pointer(prog->filter, cfg.filter);
if (res == ACT_P_CREATED) {
- tcf_hash_insert(act);
+ tcf_hash_insert(tn, act);
} else {
/* make sure the program being replaced is no longer executing */
synchronize_rcu();
@@ -367,6 +370,22 @@ static void tcf_bpf_cleanup(struct tc_action *act, int bind)
tcf_bpf_cfg_cleanup(&tmp);
}
+static int tcf_bpf_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_bpf_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_bpf_ops __read_mostly = {
.kind = "bpf",
.type = TCA_ACT_BPF,
@@ -375,16 +394,39 @@ static struct tc_action_ops act_bpf_ops __read_mostly = {
.dump = tcf_bpf_dump,
.cleanup = tcf_bpf_cleanup,
.init = tcf_bpf_init,
+ .walk = tcf_bpf_walker,
+ .lookup = tcf_bpf_search,
+};
+
+static __net_init int bpf_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+ return tc_action_net_init(tn, &act_bpf_ops, BPF_TAB_MASK);
+}
+
+static void __net_exit bpf_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, bpf_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations bpf_net_ops = {
+ .init = bpf_init_net,
+ .exit = bpf_exit_net,
+ .id = &bpf_net_id,
+ .size = sizeof(struct tc_action_net),
};
static int __init bpf_init_module(void)
{
- return tcf_register_action(&act_bpf_ops, BPF_TAB_MASK);
+ return tcf_register_action(&act_bpf_ops, &bpf_net_ops);
}
static void __exit bpf_cleanup_module(void)
{
- tcf_unregister_action(&act_bpf_ops);
+ tcf_unregister_action(&act_bpf_ops, &bpf_net_ops);
}
module_init(bpf_init_module);
diff --git a/net/sched/act_connmark.c b/net/sched/act_connmark.c
index bb41699c6c49..c0ed93ce2391 100644
--- a/net/sched/act_connmark.c
+++ b/net/sched/act_connmark.c
@@ -30,6 +30,8 @@
#define CONNMARK_TAB_MASK 3
+static int connmark_net_id;
+
static int tcf_connmark(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
@@ -97,6 +99,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
struct nlattr *tb[TCA_CONNMARK_MAX + 1];
struct tcf_connmark_info *ci;
struct tc_connmark *parm;
@@ -111,9 +114,9 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_CONNMARK_PARMS]);
- if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*ci),
- bind, false);
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*ci), bind, false);
if (ret)
return ret;
@@ -122,7 +125,7 @@ static int tcf_connmark_init(struct net *net, struct nlattr *nla,
ci->net = net;
ci->zone = parm->zone;
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
ret = ACT_P_CREATED;
} else {
ci = to_connmark(a);
@@ -169,6 +172,22 @@ nla_put_failure:
return -1;
}
+static int tcf_connmark_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_connmark_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_connmark_ops = {
.kind = "connmark",
.type = TCA_ACT_CONNMARK,
@@ -176,16 +195,39 @@ static struct tc_action_ops act_connmark_ops = {
.act = tcf_connmark,
.dump = tcf_connmark_dump,
.init = tcf_connmark_init,
+ .walk = tcf_connmark_walker,
+ .lookup = tcf_connmark_search,
+};
+
+static __net_init int connmark_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+ return tc_action_net_init(tn, &act_connmark_ops, CONNMARK_TAB_MASK);
+}
+
+static void __net_exit connmark_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, connmark_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations connmark_net_ops = {
+ .init = connmark_init_net,
+ .exit = connmark_exit_net,
+ .id = &connmark_net_id,
+ .size = sizeof(struct tc_action_net),
};
static int __init connmark_init_module(void)
{
- return tcf_register_action(&act_connmark_ops, CONNMARK_TAB_MASK);
+ return tcf_register_action(&act_connmark_ops, &connmark_net_ops);
}
static void __exit connmark_cleanup_module(void)
{
- tcf_unregister_action(&act_connmark_ops);
+ tcf_unregister_action(&act_connmark_ops, &connmark_net_ops);
}
module_init(connmark_init_module);
diff --git a/net/sched/act_csum.c b/net/sched/act_csum.c
index b07c535ba8e7..d22426cdebc0 100644
--- a/net/sched/act_csum.c
+++ b/net/sched/act_csum.c
@@ -42,9 +42,13 @@ static const struct nla_policy csum_policy[TCA_CSUM_MAX + 1] = {
[TCA_CSUM_PARMS] = { .len = sizeof(struct tc_csum), },
};
-static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+static int csum_net_id;
+
+static int tcf_csum_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a, int ovr,
+ int bind)
{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
struct nlattr *tb[TCA_CSUM_MAX + 1];
struct tc_csum *parm;
struct tcf_csum *p;
@@ -61,9 +65,9 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
return -EINVAL;
parm = nla_data(tb[TCA_CSUM_PARMS]);
- if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
- bind, false);
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*p), bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -82,7 +86,7 @@ static int tcf_csum_init(struct net *n, struct nlattr *nla, struct nlattr *est,
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
return ret;
}
@@ -105,9 +109,7 @@ static void *tcf_csum_skb_nextlayer(struct sk_buff *skb,
int hl = ihl + jhl;
if (!pskb_may_pull(skb, ipl + ntkoff) || (ipl < hl) ||
- (skb_cloned(skb) &&
- !skb_clone_writable(skb, hl + ntkoff) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ skb_try_make_writable(skb, hl + ntkoff))
return NULL;
else
return (void *)(skb_network_header(skb) + ihl);
@@ -365,9 +367,7 @@ static int tcf_csum_ipv4(struct sk_buff *skb, u32 update_flags)
}
if (update_flags & TCA_CSUM_UPDATE_FLAG_IPV4HDR) {
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, sizeof(*iph) + ntkoff) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_try_make_writable(skb, sizeof(*iph) + ntkoff))
goto fail;
ip_send_check(ip_hdr(skb));
@@ -559,6 +559,22 @@ nla_put_failure:
return -1;
}
+static int tcf_csum_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_csum_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_csum_ops = {
.kind = "csum",
.type = TCA_ACT_CSUM,
@@ -566,6 +582,29 @@ static struct tc_action_ops act_csum_ops = {
.act = tcf_csum,
.dump = tcf_csum_dump,
.init = tcf_csum_init,
+ .walk = tcf_csum_walker,
+ .lookup = tcf_csum_search,
+};
+
+static __net_init int csum_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+ return tc_action_net_init(tn, &act_csum_ops, CSUM_TAB_MASK);
+}
+
+static void __net_exit csum_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, csum_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations csum_net_ops = {
+ .init = csum_init_net,
+ .exit = csum_exit_net,
+ .id = &csum_net_id,
+ .size = sizeof(struct tc_action_net),
};
MODULE_DESCRIPTION("Checksum updating actions");
@@ -573,12 +612,12 @@ MODULE_LICENSE("GPL");
static int __init csum_init_module(void)
{
- return tcf_register_action(&act_csum_ops, CSUM_TAB_MASK);
+ return tcf_register_action(&act_csum_ops, &csum_net_ops);
}
static void __exit csum_cleanup_module(void)
{
- tcf_unregister_action(&act_csum_ops);
+ tcf_unregister_action(&act_csum_ops, &csum_net_ops);
}
module_init(csum_init_module);
diff --git a/net/sched/act_gact.c b/net/sched/act_gact.c
index 5c1b05170736..887fc1f209ff 100644
--- a/net/sched/act_gact.c
+++ b/net/sched/act_gact.c
@@ -25,6 +25,8 @@
#define GACT_TAB_MASK 15
+static int gact_net_id;
+
#ifdef CONFIG_GACT_PROB
static int gact_net_rand(struct tcf_gact *gact)
{
@@ -57,6 +59,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
struct nlattr *tb[TCA_GACT_MAX + 1];
struct tc_gact *parm;
struct tcf_gact *gact;
@@ -88,9 +91,9 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
}
#endif
- if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*gact),
- bind, true);
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*gact), bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -118,7 +121,7 @@ static int tcf_gact_init(struct net *net, struct nlattr *nla,
}
#endif
if (ret == ACT_P_CREATED)
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
return ret;
}
@@ -183,6 +186,22 @@ nla_put_failure:
return -1;
}
+static int tcf_gact_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_gact_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_gact_ops = {
.kind = "gact",
.type = TCA_ACT_GACT,
@@ -190,6 +209,29 @@ static struct tc_action_ops act_gact_ops = {
.act = tcf_gact,
.dump = tcf_gact_dump,
.init = tcf_gact_init,
+ .walk = tcf_gact_walker,
+ .lookup = tcf_gact_search,
+};
+
+static __net_init int gact_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+ return tc_action_net_init(tn, &act_gact_ops, GACT_TAB_MASK);
+}
+
+static void __net_exit gact_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, gact_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations gact_net_ops = {
+ .init = gact_init_net,
+ .exit = gact_exit_net,
+ .id = &gact_net_id,
+ .size = sizeof(struct tc_action_net),
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -203,12 +245,13 @@ static int __init gact_init_module(void)
#else
pr_info("GACT probability NOT on\n");
#endif
- return tcf_register_action(&act_gact_ops, GACT_TAB_MASK);
+
+ return tcf_register_action(&act_gact_ops, &gact_net_ops);
}
static void __exit gact_cleanup_module(void)
{
- tcf_unregister_action(&act_gact_ops);
+ tcf_unregister_action(&act_gact_ops, &gact_net_ops);
}
module_init(gact_init_module);
diff --git a/net/sched/act_ife.c b/net/sched/act_ife.c
new file mode 100644
index 000000000000..c589a9ba506a
--- /dev/null
+++ b/net/sched/act_ife.c
@@ -0,0 +1,870 @@
+/*
+ * net/sched/ife.c Inter-FE action based on ForCES WG InterFE LFB
+ *
+ * Refer to:
+ * draft-ietf-forces-interfelfb-03
+ * and
+ * netdev01 paper:
+ * "Distributing Linux Traffic Control Classifier-Action
+ * Subsystem"
+ * Authors: Jamal Hadi Salim and Damascene M. Joachimpillai
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/net_namespace.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/etherdevice.h>
+
+#define IFE_TAB_MASK 15
+
+static int ife_net_id;
+static int max_metacnt = IFE_META_MAX + 1;
+
+static const struct nla_policy ife_policy[TCA_IFE_MAX + 1] = {
+ [TCA_IFE_PARMS] = { .len = sizeof(struct tc_ife)},
+ [TCA_IFE_DMAC] = { .len = ETH_ALEN},
+ [TCA_IFE_SMAC] = { .len = ETH_ALEN},
+ [TCA_IFE_TYPE] = { .type = NLA_U16},
+};
+
+/* Caller takes care of presenting data in network order
+*/
+int ife_tlv_meta_encode(void *skbdata, u16 attrtype, u16 dlen, const void *dval)
+{
+ u32 *tlv = (u32 *)(skbdata);
+ u16 totlen = nla_total_size(dlen); /*alignment + hdr */
+ char *dptr = (char *)tlv + NLA_HDRLEN;
+ u32 htlv = attrtype << 16 | totlen;
+
+ *tlv = htonl(htlv);
+ memset(dptr, 0, totlen - NLA_HDRLEN);
+ memcpy(dptr, dval, dlen);
+
+ return totlen;
+}
+EXPORT_SYMBOL_GPL(ife_tlv_meta_encode);
+
+int ife_get_meta_u32(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+ if (mi->metaval)
+ return nla_put_u32(skb, mi->metaid, *(u32 *)mi->metaval);
+ else
+ return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u32);
+
+int ife_check_meta_u32(u32 metaval, struct tcf_meta_info *mi)
+{
+ if (metaval || mi->metaval)
+ return 8; /* T+L+V == 2+2+4 */
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_check_meta_u32);
+
+int ife_encode_meta_u32(u32 metaval, void *skbdata, struct tcf_meta_info *mi)
+{
+ u32 edata = metaval;
+
+ if (mi->metaval)
+ edata = *(u32 *)mi->metaval;
+ else if (metaval)
+ edata = metaval;
+
+ if (!edata) /* will not encode */
+ return 0;
+
+ edata = htonl(edata);
+ return ife_tlv_meta_encode(skbdata, mi->metaid, 4, &edata);
+}
+EXPORT_SYMBOL_GPL(ife_encode_meta_u32);
+
+int ife_get_meta_u16(struct sk_buff *skb, struct tcf_meta_info *mi)
+{
+ if (mi->metaval)
+ return nla_put_u16(skb, mi->metaid, *(u16 *)mi->metaval);
+ else
+ return nla_put(skb, mi->metaid, 0, NULL);
+}
+EXPORT_SYMBOL_GPL(ife_get_meta_u16);
+
+int ife_alloc_meta_u32(struct tcf_meta_info *mi, void *metaval)
+{
+ mi->metaval = kmemdup(metaval, sizeof(u32), GFP_KERNEL);
+ if (!mi->metaval)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u32);
+
+int ife_alloc_meta_u16(struct tcf_meta_info *mi, void *metaval)
+{
+ mi->metaval = kmemdup(metaval, sizeof(u16), GFP_KERNEL);
+ if (!mi->metaval)
+ return -ENOMEM;
+
+ return 0;
+}
+EXPORT_SYMBOL_GPL(ife_alloc_meta_u16);
+
+void ife_release_meta_gen(struct tcf_meta_info *mi)
+{
+ kfree(mi->metaval);
+}
+EXPORT_SYMBOL_GPL(ife_release_meta_gen);
+
+int ife_validate_meta_u32(void *val, int len)
+{
+ if (len == 4)
+ return 0;
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u32);
+
+int ife_validate_meta_u16(void *val, int len)
+{
+ /* length will include padding */
+ if (len == NLA_ALIGN(2))
+ return 0;
+
+ return -EINVAL;
+}
+EXPORT_SYMBOL_GPL(ife_validate_meta_u16);
+
+static LIST_HEAD(ifeoplist);
+static DEFINE_RWLOCK(ife_mod_lock);
+
+static struct tcf_meta_ops *find_ife_oplist(u16 metaid)
+{
+ struct tcf_meta_ops *o;
+
+ read_lock(&ife_mod_lock);
+ list_for_each_entry(o, &ifeoplist, list) {
+ if (o->metaid == metaid) {
+ if (!try_module_get(o->owner))
+ o = NULL;
+ read_unlock(&ife_mod_lock);
+ return o;
+ }
+ }
+ read_unlock(&ife_mod_lock);
+
+ return NULL;
+}
+
+int register_ife_op(struct tcf_meta_ops *mops)
+{
+ struct tcf_meta_ops *m;
+
+ if (!mops->metaid || !mops->metatype || !mops->name ||
+ !mops->check_presence || !mops->encode || !mops->decode ||
+ !mops->get || !mops->alloc)
+ return -EINVAL;
+
+ write_lock(&ife_mod_lock);
+
+ list_for_each_entry(m, &ifeoplist, list) {
+ if (m->metaid == mops->metaid ||
+ (strcmp(mops->name, m->name) == 0)) {
+ write_unlock(&ife_mod_lock);
+ return -EEXIST;
+ }
+ }
+
+ if (!mops->release)
+ mops->release = ife_release_meta_gen;
+
+ list_add_tail(&mops->list, &ifeoplist);
+ write_unlock(&ife_mod_lock);
+ return 0;
+}
+EXPORT_SYMBOL_GPL(unregister_ife_op);
+
+int unregister_ife_op(struct tcf_meta_ops *mops)
+{
+ struct tcf_meta_ops *m;
+ int err = -ENOENT;
+
+ write_lock(&ife_mod_lock);
+ list_for_each_entry(m, &ifeoplist, list) {
+ if (m->metaid == mops->metaid) {
+ list_del(&mops->list);
+ err = 0;
+ break;
+ }
+ }
+ write_unlock(&ife_mod_lock);
+
+ return err;
+}
+EXPORT_SYMBOL_GPL(register_ife_op);
+
+static int ife_validate_metatype(struct tcf_meta_ops *ops, void *val, int len)
+{
+ int ret = 0;
+ /* XXX: unfortunately cant use nla_policy at this point
+ * because a length of 0 is valid in the case of
+ * "allow". "use" semantics do enforce for proper
+ * length and i couldve use nla_policy but it makes it hard
+ * to use it just for that..
+ */
+ if (ops->validate)
+ return ops->validate(val, len);
+
+ if (ops->metatype == NLA_U32)
+ ret = ife_validate_meta_u32(val, len);
+ else if (ops->metatype == NLA_U16)
+ ret = ife_validate_meta_u16(val, len);
+
+ return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int load_metaops_and_vet(struct tcf_ife_info *ife, u32 metaid,
+ void *val, int len)
+{
+ struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+ int ret = 0;
+
+ if (!ops) {
+ ret = -ENOENT;
+#ifdef CONFIG_MODULES
+ spin_unlock_bh(&ife->tcf_lock);
+ rtnl_unlock();
+ request_module("ifemeta%u", metaid);
+ rtnl_lock();
+ spin_lock_bh(&ife->tcf_lock);
+ ops = find_ife_oplist(metaid);
+#endif
+ }
+
+ if (ops) {
+ ret = 0;
+ if (len)
+ ret = ife_validate_metatype(ops, val, len);
+
+ module_put(ops->owner);
+ }
+
+ return ret;
+}
+
+/* called when adding new meta information
+ * under ife->tcf_lock
+*/
+static int add_metainfo(struct tcf_ife_info *ife, u32 metaid, void *metaval,
+ int len)
+{
+ struct tcf_meta_info *mi = NULL;
+ struct tcf_meta_ops *ops = find_ife_oplist(metaid);
+ int ret = 0;
+
+ if (!ops)
+ return -ENOENT;
+
+ mi = kzalloc(sizeof(*mi), GFP_KERNEL);
+ if (!mi) {
+ /*put back what find_ife_oplist took */
+ module_put(ops->owner);
+ return -ENOMEM;
+ }
+
+ mi->metaid = metaid;
+ mi->ops = ops;
+ if (len > 0) {
+ ret = ops->alloc(mi, metaval);
+ if (ret != 0) {
+ kfree(mi);
+ module_put(ops->owner);
+ return ret;
+ }
+ }
+
+ list_add_tail(&mi->metalist, &ife->metalist);
+
+ return ret;
+}
+
+static int use_all_metadata(struct tcf_ife_info *ife)
+{
+ struct tcf_meta_ops *o;
+ int rc = 0;
+ int installed = 0;
+
+ list_for_each_entry(o, &ifeoplist, list) {
+ rc = add_metainfo(ife, o->metaid, NULL, 0);
+ if (rc == 0)
+ installed += 1;
+ }
+
+ if (installed)
+ return 0;
+ else
+ return -EINVAL;
+}
+
+static int dump_metalist(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+ struct tcf_meta_info *e;
+ struct nlattr *nest;
+ unsigned char *b = skb_tail_pointer(skb);
+ int total_encoded = 0;
+
+ /*can only happen on decode */
+ if (list_empty(&ife->metalist))
+ return 0;
+
+ nest = nla_nest_start(skb, TCA_IFE_METALST);
+ if (!nest)
+ goto out_nlmsg_trim;
+
+ list_for_each_entry(e, &ife->metalist, metalist) {
+ if (!e->ops->get(skb, e))
+ total_encoded += 1;
+ }
+
+ if (!total_encoded)
+ goto out_nlmsg_trim;
+
+ nla_nest_end(skb, nest);
+
+ return 0;
+
+out_nlmsg_trim:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+/* under ife->tcf_lock */
+static void _tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+ struct tcf_ife_info *ife = a->priv;
+ struct tcf_meta_info *e, *n;
+
+ list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+ module_put(e->ops->owner);
+ list_del(&e->metalist);
+ if (e->metaval) {
+ if (e->ops->release)
+ e->ops->release(e);
+ else
+ kfree(e->metaval);
+ }
+ kfree(e);
+ }
+}
+
+static void tcf_ife_cleanup(struct tc_action *a, int bind)
+{
+ struct tcf_ife_info *ife = a->priv;
+
+ spin_lock_bh(&ife->tcf_lock);
+ _tcf_ife_cleanup(a, bind);
+ spin_unlock_bh(&ife->tcf_lock);
+}
+
+/* under ife->tcf_lock */
+static int populate_metalist(struct tcf_ife_info *ife, struct nlattr **tb)
+{
+ int len = 0;
+ int rc = 0;
+ int i = 0;
+ void *val;
+
+ for (i = 1; i < max_metacnt; i++) {
+ if (tb[i]) {
+ val = nla_data(tb[i]);
+ len = nla_len(tb[i]);
+
+ rc = load_metaops_and_vet(ife, i, val, len);
+ if (rc != 0)
+ return rc;
+
+ rc = add_metainfo(ife, i, val, len);
+ if (rc)
+ return rc;
+ }
+ }
+
+ return rc;
+}
+
+static int tcf_ife_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a,
+ int ovr, int bind)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+ struct nlattr *tb[TCA_IFE_MAX + 1];
+ struct nlattr *tb2[IFE_META_MAX + 1];
+ struct tcf_ife_info *ife;
+ struct tc_ife *parm;
+ u16 ife_type = 0;
+ u8 *daddr = NULL;
+ u8 *saddr = NULL;
+ int ret = 0;
+ int err;
+
+ err = nla_parse_nested(tb, TCA_IFE_MAX, nla, ife_policy);
+ if (err < 0)
+ return err;
+
+ if (!tb[TCA_IFE_PARMS])
+ return -EINVAL;
+
+ parm = nla_data(tb[TCA_IFE_PARMS]);
+
+ if (parm->flags & IFE_ENCODE) {
+ /* Until we get issued the ethertype, we cant have
+ * a default..
+ **/
+ if (!tb[TCA_IFE_TYPE]) {
+ pr_info("You MUST pass etherype for encoding\n");
+ return -EINVAL;
+ }
+ }
+
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, a, sizeof(*ife),
+ bind, false);
+ if (ret)
+ return ret;
+ ret = ACT_P_CREATED;
+ } else {
+ if (bind) /* dont override defaults */
+ return 0;
+ tcf_hash_release(a, bind);
+ if (!ovr)
+ return -EEXIST;
+ }
+
+ ife = to_ife(a);
+ ife->flags = parm->flags;
+
+ if (parm->flags & IFE_ENCODE) {
+ ife_type = nla_get_u16(tb[TCA_IFE_TYPE]);
+ if (tb[TCA_IFE_DMAC])
+ daddr = nla_data(tb[TCA_IFE_DMAC]);
+ if (tb[TCA_IFE_SMAC])
+ saddr = nla_data(tb[TCA_IFE_SMAC]);
+ }
+
+ spin_lock_bh(&ife->tcf_lock);
+ ife->tcf_action = parm->action;
+
+ if (parm->flags & IFE_ENCODE) {
+ if (daddr)
+ ether_addr_copy(ife->eth_dst, daddr);
+ else
+ eth_zero_addr(ife->eth_dst);
+
+ if (saddr)
+ ether_addr_copy(ife->eth_src, saddr);
+ else
+ eth_zero_addr(ife->eth_src);
+
+ ife->eth_type = ife_type;
+ }
+
+ if (ret == ACT_P_CREATED)
+ INIT_LIST_HEAD(&ife->metalist);
+
+ if (tb[TCA_IFE_METALST]) {
+ err = nla_parse_nested(tb2, IFE_META_MAX, tb[TCA_IFE_METALST],
+ NULL);
+ if (err) {
+metadata_parse_err:
+ if (ret == ACT_P_CREATED)
+ _tcf_ife_cleanup(a, bind);
+
+ spin_unlock_bh(&ife->tcf_lock);
+ return err;
+ }
+
+ err = populate_metalist(ife, tb2);
+ if (err)
+ goto metadata_parse_err;
+
+ } else {
+ /* if no passed metadata allow list or passed allow-all
+ * then here we process by adding as many supported metadatum
+ * as we can. You better have at least one else we are
+ * going to bail out
+ */
+ err = use_all_metadata(ife);
+ if (err) {
+ if (ret == ACT_P_CREATED)
+ _tcf_ife_cleanup(a, bind);
+
+ spin_unlock_bh(&ife->tcf_lock);
+ return err;
+ }
+ }
+
+ spin_unlock_bh(&ife->tcf_lock);
+
+ if (ret == ACT_P_CREATED)
+ tcf_hash_insert(tn, a);
+
+ return ret;
+}
+
+static int tcf_ife_dump(struct sk_buff *skb, struct tc_action *a, int bind,
+ int ref)
+{
+ unsigned char *b = skb_tail_pointer(skb);
+ struct tcf_ife_info *ife = a->priv;
+ struct tc_ife opt = {
+ .index = ife->tcf_index,
+ .refcnt = ife->tcf_refcnt - ref,
+ .bindcnt = ife->tcf_bindcnt - bind,
+ .action = ife->tcf_action,
+ .flags = ife->flags,
+ };
+ struct tcf_t t;
+
+ if (nla_put(skb, TCA_IFE_PARMS, sizeof(opt), &opt))
+ goto nla_put_failure;
+
+ t.install = jiffies_to_clock_t(jiffies - ife->tcf_tm.install);
+ t.lastuse = jiffies_to_clock_t(jiffies - ife->tcf_tm.lastuse);
+ t.expires = jiffies_to_clock_t(ife->tcf_tm.expires);
+ if (nla_put(skb, TCA_IFE_TM, sizeof(t), &t))
+ goto nla_put_failure;
+
+ if (!is_zero_ether_addr(ife->eth_dst)) {
+ if (nla_put(skb, TCA_IFE_DMAC, ETH_ALEN, ife->eth_dst))
+ goto nla_put_failure;
+ }
+
+ if (!is_zero_ether_addr(ife->eth_src)) {
+ if (nla_put(skb, TCA_IFE_SMAC, ETH_ALEN, ife->eth_src))
+ goto nla_put_failure;
+ }
+
+ if (nla_put(skb, TCA_IFE_TYPE, 2, &ife->eth_type))
+ goto nla_put_failure;
+
+ if (dump_metalist(skb, ife)) {
+ /*ignore failure to dump metalist */
+ pr_info("Failed to dump metalist\n");
+ }
+
+ return skb->len;
+
+nla_put_failure:
+ nlmsg_trim(skb, b);
+ return -1;
+}
+
+int find_decode_metaid(struct sk_buff *skb, struct tcf_ife_info *ife,
+ u16 metaid, u16 mlen, void *mdata)
+{
+ struct tcf_meta_info *e;
+
+ /* XXX: use hash to speed up */
+ list_for_each_entry(e, &ife->metalist, metalist) {
+ if (metaid == e->metaid) {
+ if (e->ops) {
+ /* We check for decode presence already */
+ return e->ops->decode(skb, mdata, mlen);
+ }
+ }
+ }
+
+ return 0;
+}
+
+struct ifeheadr {
+ __be16 metalen;
+ u8 tlv_data[];
+};
+
+struct meta_tlvhdr {
+ __be16 type;
+ __be16 len;
+};
+
+static int tcf_ife_decode(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_ife_info *ife = a->priv;
+ int action = ife->tcf_action;
+ struct ifeheadr *ifehdr = (struct ifeheadr *)skb->data;
+ u16 ifehdrln = ifehdr->metalen;
+ struct meta_tlvhdr *tlv = (struct meta_tlvhdr *)(ifehdr->tlv_data);
+
+ spin_lock(&ife->tcf_lock);
+ bstats_update(&ife->tcf_bstats, skb);
+ ife->tcf_tm.lastuse = jiffies;
+ spin_unlock(&ife->tcf_lock);
+
+ ifehdrln = ntohs(ifehdrln);
+ if (unlikely(!pskb_may_pull(skb, ifehdrln))) {
+ spin_lock(&ife->tcf_lock);
+ ife->tcf_qstats.drops++;
+ spin_unlock(&ife->tcf_lock);
+ return TC_ACT_SHOT;
+ }
+
+ skb_set_mac_header(skb, ifehdrln);
+ __skb_pull(skb, ifehdrln);
+ skb->protocol = eth_type_trans(skb, skb->dev);
+ ifehdrln -= IFE_METAHDRLEN;
+
+ while (ifehdrln > 0) {
+ u8 *tlvdata = (u8 *)tlv;
+ u16 mtype = tlv->type;
+ u16 mlen = tlv->len;
+
+ mtype = ntohs(mtype);
+ mlen = ntohs(mlen);
+
+ if (find_decode_metaid(skb, ife, mtype, (mlen - 4),
+ (void *)(tlvdata + 4))) {
+ /* abuse overlimits to count when we receive metadata
+ * but dont have an ops for it
+ */
+ pr_info_ratelimited("Unknown metaid %d alnlen %d\n",
+ mtype, mlen);
+ ife->tcf_qstats.overlimits++;
+ }
+
+ tlvdata += mlen;
+ ifehdrln -= mlen;
+ tlv = (struct meta_tlvhdr *)tlvdata;
+ }
+
+ skb_reset_network_header(skb);
+ return action;
+}
+
+/*XXX: check if we can do this at install time instead of current
+ * send data path
+**/
+static int ife_get_sz(struct sk_buff *skb, struct tcf_ife_info *ife)
+{
+ struct tcf_meta_info *e, *n;
+ int tot_run_sz = 0, run_sz = 0;
+
+ list_for_each_entry_safe(e, n, &ife->metalist, metalist) {
+ if (e->ops->check_presence) {
+ run_sz = e->ops->check_presence(skb, e);
+ tot_run_sz += run_sz;
+ }
+ }
+
+ return tot_run_sz;
+}
+
+static int tcf_ife_encode(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_ife_info *ife = a->priv;
+ int action = ife->tcf_action;
+ struct ethhdr *oethh; /* outer ether header */
+ struct ethhdr *iethh; /* inner eth header */
+ struct tcf_meta_info *e;
+ /*
+ OUTERHDR:TOTMETALEN:{TLVHDR:Metadatum:TLVHDR..}:ORIGDATA
+ where ORIGDATA = original ethernet header ...
+ */
+ u16 metalen = ife_get_sz(skb, ife);
+ int hdrm = metalen + skb->dev->hard_header_len + IFE_METAHDRLEN;
+ unsigned int skboff = skb->dev->hard_header_len;
+ u32 at = G_TC_AT(skb->tc_verd);
+ int new_len = skb->len + hdrm;
+ bool exceed_mtu = false;
+ int err;
+
+ if (at & AT_EGRESS) {
+ if (new_len > skb->dev->mtu)
+ exceed_mtu = true;
+ }
+
+ spin_lock(&ife->tcf_lock);
+ bstats_update(&ife->tcf_bstats, skb);
+ ife->tcf_tm.lastuse = jiffies;
+
+ if (!metalen) { /* no metadata to send */
+ /* abuse overlimits to count when we allow packet
+ * with no metadata
+ */
+ ife->tcf_qstats.overlimits++;
+ spin_unlock(&ife->tcf_lock);
+ return action;
+ }
+ /* could be stupid policy setup or mtu config
+ * so lets be conservative.. */
+ if ((action == TC_ACT_SHOT) || exceed_mtu) {
+ ife->tcf_qstats.drops++;
+ spin_unlock(&ife->tcf_lock);
+ return TC_ACT_SHOT;
+ }
+
+ iethh = eth_hdr(skb);
+
+ err = skb_cow_head(skb, hdrm);
+ if (unlikely(err)) {
+ ife->tcf_qstats.drops++;
+ spin_unlock(&ife->tcf_lock);
+ return TC_ACT_SHOT;
+ }
+
+ if (!(at & AT_EGRESS))
+ skb_push(skb, skb->dev->hard_header_len);
+
+ __skb_push(skb, hdrm);
+ memcpy(skb->data, iethh, skb->mac_len);
+ skb_reset_mac_header(skb);
+ oethh = eth_hdr(skb);
+
+ /*total metadata length */
+ metalen += IFE_METAHDRLEN;
+ metalen = htons(metalen);
+ memcpy((skb->data + skboff), &metalen, IFE_METAHDRLEN);
+ skboff += IFE_METAHDRLEN;
+
+ /* XXX: we dont have a clever way of telling encode to
+ * not repeat some of the computations that are done by
+ * ops->presence_check...
+ */
+ list_for_each_entry(e, &ife->metalist, metalist) {
+ if (e->ops->encode) {
+ err = e->ops->encode(skb, (void *)(skb->data + skboff),
+ e);
+ }
+ if (err < 0) {
+ /* too corrupt to keep around if overwritten */
+ ife->tcf_qstats.drops++;
+ spin_unlock(&ife->tcf_lock);
+ return TC_ACT_SHOT;
+ }
+ skboff += err;
+ }
+
+ if (!is_zero_ether_addr(ife->eth_src))
+ ether_addr_copy(oethh->h_source, ife->eth_src);
+ else
+ ether_addr_copy(oethh->h_source, iethh->h_source);
+ if (!is_zero_ether_addr(ife->eth_dst))
+ ether_addr_copy(oethh->h_dest, ife->eth_dst);
+ else
+ ether_addr_copy(oethh->h_dest, iethh->h_dest);
+ oethh->h_proto = htons(ife->eth_type);
+
+ if (!(at & AT_EGRESS))
+ skb_pull(skb, skb->dev->hard_header_len);
+
+ spin_unlock(&ife->tcf_lock);
+
+ return action;
+}
+
+static int tcf_ife_act(struct sk_buff *skb, const struct tc_action *a,
+ struct tcf_result *res)
+{
+ struct tcf_ife_info *ife = a->priv;
+
+ if (ife->flags & IFE_ENCODE)
+ return tcf_ife_encode(skb, a, res);
+
+ if (!(ife->flags & IFE_ENCODE))
+ return tcf_ife_decode(skb, a, res);
+
+ pr_info_ratelimited("unknown failure(policy neither de/encode\n");
+ spin_lock(&ife->tcf_lock);
+ bstats_update(&ife->tcf_bstats, skb);
+ ife->tcf_tm.lastuse = jiffies;
+ ife->tcf_qstats.drops++;
+ spin_unlock(&ife->tcf_lock);
+
+ return TC_ACT_SHOT;
+}
+
+static int tcf_ife_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_ife_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
+static struct tc_action_ops act_ife_ops = {
+ .kind = "ife",
+ .type = TCA_ACT_IFE,
+ .owner = THIS_MODULE,
+ .act = tcf_ife_act,
+ .dump = tcf_ife_dump,
+ .cleanup = tcf_ife_cleanup,
+ .init = tcf_ife_init,
+ .walk = tcf_ife_walker,
+ .lookup = tcf_ife_search,
+};
+
+static __net_init int ife_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+ return tc_action_net_init(tn, &act_ife_ops, IFE_TAB_MASK);
+}
+
+static void __net_exit ife_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ife_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations ife_net_ops = {
+ .init = ife_init_net,
+ .exit = ife_exit_net,
+ .id = &ife_net_id,
+ .size = sizeof(struct tc_action_net),
+};
+
+static int __init ife_init_module(void)
+{
+ return tcf_register_action(&act_ife_ops, &ife_net_ops);
+}
+
+static void __exit ife_cleanup_module(void)
+{
+ tcf_unregister_action(&act_ife_ops, &ife_net_ops);
+}
+
+module_init(ife_init_module);
+module_exit(ife_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE LFB action");
+MODULE_LICENSE("GPL");
diff --git a/net/sched/act_ipt.c b/net/sched/act_ipt.c
index d05869646515..350e134cffb3 100644
--- a/net/sched/act_ipt.c
+++ b/net/sched/act_ipt.c
@@ -30,6 +30,10 @@
#define IPT_TAB_MASK 15
+static int ipt_net_id;
+
+static int xt_net_id;
+
static int ipt_init_target(struct xt_entry_target *t, char *table, unsigned int hook)
{
struct xt_tgchk_param par;
@@ -62,6 +66,7 @@ static void ipt_destroy_target(struct xt_entry_target *t)
struct xt_tgdtor_param par = {
.target = t->u.kernel.target,
.targinfo = t->data,
+ .family = NFPROTO_IPV4,
};
if (par.target->destroy != NULL)
par.target->destroy(&par);
@@ -83,8 +88,9 @@ static const struct nla_policy ipt_policy[TCA_IPT_MAX + 1] = {
[TCA_IPT_TARG] = { .len = sizeof(struct xt_entry_target) },
};
-static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
- struct tc_action *a, int ovr, int bind)
+static int __tcf_ipt_init(struct tc_action_net *tn, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a, int ovr,
+ int bind)
{
struct nlattr *tb[TCA_IPT_MAX + 1];
struct tcf_ipt *ipt;
@@ -113,8 +119,9 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
if (tb[TCA_IPT_INDEX] != NULL)
index = nla_get_u32(tb[TCA_IPT_INDEX]);
- if (!tcf_hash_check(index, a, bind) ) {
- ret = tcf_hash_create(index, est, a, sizeof(*ipt), bind, false);
+ if (!tcf_hash_check(tn, index, a, bind)) {
+ ret = tcf_hash_create(tn, index, est, a, sizeof(*ipt), bind,
+ false);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -157,7 +164,7 @@ static int tcf_ipt_init(struct net *net, struct nlattr *nla, struct nlattr *est,
ipt->tcfi_hook = hook;
spin_unlock_bh(&ipt->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
return ret;
err3:
@@ -170,6 +177,24 @@ err1:
return err;
}
+static int tcf_ipt_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a, int ovr,
+ int bind)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+}
+
+static int tcf_xt_init(struct net *net, struct nlattr *nla,
+ struct nlattr *est, struct tc_action *a, int ovr,
+ int bind)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ return __tcf_ipt_init(tn, nla, est, a, ovr, bind);
+}
+
static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
@@ -195,6 +220,7 @@ static int tcf_ipt(struct sk_buff *skb, const struct tc_action *a,
par.hooknum = ipt->tcfi_hook;
par.target = ipt->tcfi_t->u.kernel.target;
par.targinfo = ipt->tcfi_t->data;
+ par.family = NFPROTO_IPV4;
ret = par.target->target(skb, &par);
switch (ret) {
@@ -260,6 +286,22 @@ nla_put_failure:
return -1;
}
+static int tcf_ipt_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_ipt_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_ipt_ops = {
.kind = "ipt",
.type = TCA_ACT_IPT,
@@ -268,8 +310,47 @@ static struct tc_action_ops act_ipt_ops = {
.dump = tcf_ipt_dump,
.cleanup = tcf_ipt_release,
.init = tcf_ipt_init,
+ .walk = tcf_ipt_walker,
+ .lookup = tcf_ipt_search,
+};
+
+static __net_init int ipt_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ return tc_action_net_init(tn, &act_ipt_ops, IPT_TAB_MASK);
+}
+
+static void __net_exit ipt_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, ipt_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations ipt_net_ops = {
+ .init = ipt_init_net,
+ .exit = ipt_exit_net,
+ .id = &ipt_net_id,
+ .size = sizeof(struct tc_action_net),
};
+static int tcf_xt_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_xt_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_xt_ops = {
.kind = "xt",
.type = TCA_ACT_XT,
@@ -277,7 +358,30 @@ static struct tc_action_ops act_xt_ops = {
.act = tcf_ipt,
.dump = tcf_ipt_dump,
.cleanup = tcf_ipt_release,
- .init = tcf_ipt_init,
+ .init = tcf_xt_init,
+ .walk = tcf_xt_walker,
+ .lookup = tcf_xt_search,
+};
+
+static __net_init int xt_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ return tc_action_net_init(tn, &act_xt_ops, IPT_TAB_MASK);
+}
+
+static void __net_exit xt_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, xt_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations xt_net_ops = {
+ .init = xt_init_net,
+ .exit = xt_exit_net,
+ .id = &xt_net_id,
+ .size = sizeof(struct tc_action_net),
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-13)");
@@ -289,12 +393,13 @@ static int __init ipt_init_module(void)
{
int ret1, ret2;
- ret1 = tcf_register_action(&act_xt_ops, IPT_TAB_MASK);
+ ret1 = tcf_register_action(&act_xt_ops, &xt_net_ops);
if (ret1 < 0)
- printk("Failed to load xt action\n");
- ret2 = tcf_register_action(&act_ipt_ops, IPT_TAB_MASK);
+ pr_err("Failed to load xt action\n");
+
+ ret2 = tcf_register_action(&act_ipt_ops, &ipt_net_ops);
if (ret2 < 0)
- printk("Failed to load ipt action\n");
+ pr_err("Failed to load ipt action\n");
if (ret1 < 0 && ret2 < 0) {
return ret1;
@@ -304,8 +409,8 @@ static int __init ipt_init_module(void)
static void __exit ipt_cleanup_module(void)
{
- tcf_unregister_action(&act_xt_ops);
- tcf_unregister_action(&act_ipt_ops);
+ tcf_unregister_action(&act_ipt_ops, &ipt_net_ops);
+ tcf_unregister_action(&act_xt_ops, &xt_net_ops);
}
module_init(ipt_init_module);
diff --git a/net/sched/act_meta_mark.c b/net/sched/act_meta_mark.c
new file mode 100644
index 000000000000..82892170ce4f
--- /dev/null
+++ b/net/sched/act_meta_mark.c
@@ -0,0 +1,79 @@
+/*
+ * net/sched/act_meta_mark.c IFE skb->mark metadata module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+#include <linux/rtnetlink.h>
+
+static int skbmark_encode(struct sk_buff *skb, void *skbdata,
+ struct tcf_meta_info *e)
+{
+ u32 ifemark = skb->mark;
+
+ return ife_encode_meta_u32(ifemark, skbdata, e);
+}
+
+static int skbmark_decode(struct sk_buff *skb, void *data, u16 len)
+{
+ u32 ifemark = *(u32 *)data;
+
+ skb->mark = ntohl(ifemark);
+ return 0;
+}
+
+static int skbmark_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+ return ife_check_meta_u32(skb->mark, e);
+}
+
+static struct tcf_meta_ops ife_skbmark_ops = {
+ .metaid = IFE_META_SKBMARK,
+ .metatype = NLA_U32,
+ .name = "skbmark",
+ .synopsis = "skb mark 32 bit metadata",
+ .check_presence = skbmark_check,
+ .encode = skbmark_encode,
+ .decode = skbmark_decode,
+ .get = ife_get_meta_u32,
+ .alloc = ife_alloc_meta_u32,
+ .release = ife_release_meta_gen,
+ .validate = ife_validate_meta_u32,
+ .owner = THIS_MODULE,
+};
+
+static int __init ifemark_init_module(void)
+{
+ return register_ife_op(&ife_skbmark_ops);
+}
+
+static void __exit ifemark_cleanup_module(void)
+{
+ unregister_ife_op(&ife_skbmark_ops);
+}
+
+module_init(ifemark_init_module);
+module_exit(ifemark_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE skb mark metadata module");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META(IFE_META_SKBMARK);
diff --git a/net/sched/act_meta_skbprio.c b/net/sched/act_meta_skbprio.c
new file mode 100644
index 000000000000..26bf4d86030b
--- /dev/null
+++ b/net/sched/act_meta_skbprio.c
@@ -0,0 +1,76 @@
+/*
+ * net/sched/act_meta_prio.c IFE skb->priority metadata module
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ *
+ * copyright Jamal Hadi Salim (2015)
+ *
+*/
+
+#include <linux/types.h>
+#include <linux/kernel.h>
+#include <linux/string.h>
+#include <linux/errno.h>
+#include <linux/skbuff.h>
+#include <linux/rtnetlink.h>
+#include <linux/module.h>
+#include <linux/init.h>
+#include <net/netlink.h>
+#include <net/pkt_sched.h>
+#include <uapi/linux/tc_act/tc_ife.h>
+#include <net/tc_act/tc_ife.h>
+
+static int skbprio_check(struct sk_buff *skb, struct tcf_meta_info *e)
+{
+ return ife_check_meta_u32(skb->priority, e);
+}
+
+static int skbprio_encode(struct sk_buff *skb, void *skbdata,
+ struct tcf_meta_info *e)
+{
+ u32 ifeprio = skb->priority; /* avoid having to cast skb->priority*/
+
+ return ife_encode_meta_u32(ifeprio, skbdata, e);
+}
+
+static int skbprio_decode(struct sk_buff *skb, void *data, u16 len)
+{
+ u32 ifeprio = *(u32 *)data;
+
+ skb->priority = ntohl(ifeprio);
+ return 0;
+}
+
+static struct tcf_meta_ops ife_prio_ops = {
+ .metaid = IFE_META_PRIO,
+ .metatype = NLA_U32,
+ .name = "skbprio",
+ .synopsis = "skb prio metadata",
+ .check_presence = skbprio_check,
+ .encode = skbprio_encode,
+ .decode = skbprio_decode,
+ .get = ife_get_meta_u32,
+ .alloc = ife_alloc_meta_u32,
+ .owner = THIS_MODULE,
+};
+
+static int __init ifeprio_init_module(void)
+{
+ return register_ife_op(&ife_prio_ops);
+}
+
+static void __exit ifeprio_cleanup_module(void)
+{
+ unregister_ife_op(&ife_prio_ops);
+}
+
+module_init(ifeprio_init_module);
+module_exit(ifeprio_cleanup_module);
+
+MODULE_AUTHOR("Jamal Hadi Salim(2015)");
+MODULE_DESCRIPTION("Inter-FE skb prio metadata action");
+MODULE_LICENSE("GPL");
+MODULE_ALIAS_IFE_META(IFE_META_PRIO);
diff --git a/net/sched/act_mirred.c b/net/sched/act_mirred.c
index 32fcdecdb9e2..e8a760cf7775 100644
--- a/net/sched/act_mirred.c
+++ b/net/sched/act_mirred.c
@@ -50,10 +50,13 @@ static const struct nla_policy mirred_policy[TCA_MIRRED_MAX + 1] = {
[TCA_MIRRED_PARMS] = { .len = sizeof(struct tc_mirred) },
};
+static int mirred_net_id;
+
static int tcf_mirred_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a, int ovr,
int bind)
{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
struct nlattr *tb[TCA_MIRRED_MAX + 1];
struct tc_mirred *parm;
struct tcf_mirred *m;
@@ -96,11 +99,11 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
dev = NULL;
}
- if (!tcf_hash_check(parm->index, a, bind)) {
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
if (dev == NULL)
return -EINVAL;
- ret = tcf_hash_create(parm->index, est, a, sizeof(*m),
- bind, true);
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*m), bind, true);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -130,7 +133,7 @@ static int tcf_mirred_init(struct net *net, struct nlattr *nla,
spin_lock_bh(&mirred_list_lock);
list_add(&m->tcfm_list, &mirred_list);
spin_unlock_bh(&mirred_list_lock);
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
}
return ret;
@@ -179,7 +182,6 @@ static int tcf_mirred(struct sk_buff *skb, const struct tc_action *a,
skb2->skb_iif = skb->dev->ifindex;
skb2->dev = dev;
- skb_sender_cpu_clear(skb2);
err = dev_queue_xmit(skb2);
if (err) {
@@ -221,6 +223,22 @@ nla_put_failure:
return -1;
}
+static int tcf_mirred_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_mirred_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static int mirred_device_event(struct notifier_block *unused,
unsigned long event, void *ptr)
{
@@ -257,6 +275,29 @@ static struct tc_action_ops act_mirred_ops = {
.dump = tcf_mirred_dump,
.cleanup = tcf_mirred_release,
.init = tcf_mirred_init,
+ .walk = tcf_mirred_walker,
+ .lookup = tcf_mirred_search,
+};
+
+static __net_init int mirred_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+ return tc_action_net_init(tn, &act_mirred_ops, MIRRED_TAB_MASK);
+}
+
+static void __net_exit mirred_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, mirred_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations mirred_net_ops = {
+ .init = mirred_init_net,
+ .exit = mirred_exit_net,
+ .id = &mirred_net_id,
+ .size = sizeof(struct tc_action_net),
};
MODULE_AUTHOR("Jamal Hadi Salim(2002)");
@@ -270,12 +311,12 @@ static int __init mirred_init_module(void)
return err;
pr_info("Mirror/redirect action on\n");
- return tcf_register_action(&act_mirred_ops, MIRRED_TAB_MASK);
+ return tcf_register_action(&act_mirred_ops, &mirred_net_ops);
}
static void __exit mirred_cleanup_module(void)
{
- tcf_unregister_action(&act_mirred_ops);
+ tcf_unregister_action(&act_mirred_ops, &mirred_net_ops);
unregister_netdevice_notifier(&mirred_device_notifier);
}
diff --git a/net/sched/act_nat.c b/net/sched/act_nat.c
index b7c4ead8b5a8..0f65cdfbfb1d 100644
--- a/net/sched/act_nat.c
+++ b/net/sched/act_nat.c
@@ -31,6 +31,8 @@
#define NAT_TAB_MASK 15
+static int nat_net_id;
+
static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
[TCA_NAT_PARMS] = { .len = sizeof(struct tc_nat) },
};
@@ -38,6 +40,7 @@ static const struct nla_policy nat_policy[TCA_NAT_MAX + 1] = {
static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
struct tc_action *a, int ovr, int bind)
{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
struct nlattr *tb[TCA_NAT_MAX + 1];
struct tc_nat *parm;
int ret = 0, err;
@@ -54,9 +57,9 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
return -EINVAL;
parm = nla_data(tb[TCA_NAT_PARMS]);
- if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
- bind, false);
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*p), bind, false);
if (ret)
return ret;
ret = ACT_P_CREATED;
@@ -79,7 +82,7 @@ static int tcf_nat_init(struct net *net, struct nlattr *nla, struct nlattr *est,
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
return ret;
}
@@ -126,9 +129,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
addr = iph->daddr;
if (!((old_addr ^ addr) & mask)) {
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, sizeof(*iph) + noff) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_try_make_writable(skb, sizeof(*iph) + noff))
goto drop;
new_addr &= mask;
@@ -156,9 +157,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
struct tcphdr *tcph;
if (!pskb_may_pull(skb, ihl + sizeof(*tcph) + noff) ||
- (skb_cloned(skb) &&
- !skb_clone_writable(skb, ihl + sizeof(*tcph) + noff) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ skb_try_make_writable(skb, ihl + sizeof(*tcph) + noff))
goto drop;
tcph = (void *)(skb_network_header(skb) + ihl);
@@ -171,9 +170,7 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
struct udphdr *udph;
if (!pskb_may_pull(skb, ihl + sizeof(*udph) + noff) ||
- (skb_cloned(skb) &&
- !skb_clone_writable(skb, ihl + sizeof(*udph) + noff) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC)))
+ skb_try_make_writable(skb, ihl + sizeof(*udph) + noff))
goto drop;
udph = (void *)(skb_network_header(skb) + ihl);
@@ -213,10 +210,8 @@ static int tcf_nat(struct sk_buff *skb, const struct tc_action *a,
if ((old_addr ^ addr) & mask)
break;
- if (skb_cloned(skb) &&
- !skb_clone_writable(skb, ihl + sizeof(*icmph) +
- sizeof(*iph) + noff) &&
- pskb_expand_head(skb, 0, 0, GFP_ATOMIC))
+ if (skb_try_make_writable(skb, ihl + sizeof(*icmph) +
+ sizeof(*iph) + noff))
goto drop;
icmph = (void *)(skb_network_header(skb) + ihl);
@@ -282,6 +277,22 @@ nla_put_failure:
return -1;
}
+static int tcf_nat_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_nat_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_nat_ops = {
.kind = "nat",
.type = TCA_ACT_NAT,
@@ -289,6 +300,29 @@ static struct tc_action_ops act_nat_ops = {
.act = tcf_nat,
.dump = tcf_nat_dump,
.init = tcf_nat_init,
+ .walk = tcf_nat_walker,
+ .lookup = tcf_nat_search,
+};
+
+static __net_init int nat_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+ return tc_action_net_init(tn, &act_nat_ops, NAT_TAB_MASK);
+}
+
+static void __net_exit nat_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, nat_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations nat_net_ops = {
+ .init = nat_init_net,
+ .exit = nat_exit_net,
+ .id = &nat_net_id,
+ .size = sizeof(struct tc_action_net),
};
MODULE_DESCRIPTION("Stateless NAT actions");
@@ -296,12 +330,12 @@ MODULE_LICENSE("GPL");
static int __init nat_init_module(void)
{
- return tcf_register_action(&act_nat_ops, NAT_TAB_MASK);
+ return tcf_register_action(&act_nat_ops, &nat_net_ops);
}
static void __exit nat_cleanup_module(void)
{
- tcf_unregister_action(&act_nat_ops);
+ tcf_unregister_action(&act_nat_ops, &nat_net_ops);
}
module_init(nat_init_module);
diff --git a/net/sched/act_pedit.c b/net/sched/act_pedit.c
index e38a7701f154..429c3ab65142 100644
--- a/net/sched/act_pedit.c
+++ b/net/sched/act_pedit.c
@@ -25,6 +25,8 @@
#define PEDIT_TAB_MASK 15
+static int pedit_net_id;
+
static const struct nla_policy pedit_policy[TCA_PEDIT_MAX + 1] = {
[TCA_PEDIT_PARMS] = { .len = sizeof(struct tc_pedit) },
};
@@ -33,6 +35,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
struct nlattr *tb[TCA_PEDIT_MAX + 1];
struct tc_pedit *parm;
int ret = 0, err;
@@ -54,11 +57,11 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
if (nla_len(tb[TCA_PEDIT_PARMS]) < sizeof(*parm) + ksize)
return -EINVAL;
- if (!tcf_hash_check(parm->index, a, bind)) {
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
if (!parm->nkeys)
return -EINVAL;
- ret = tcf_hash_create(parm->index, est, a, sizeof(*p),
- bind, false);
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*p), bind, false);
if (ret)
return ret;
p = to_pedit(a);
@@ -93,7 +96,7 @@ static int tcf_pedit_init(struct net *net, struct nlattr *nla,
memcpy(p->tcfp_keys, parm->keys, ksize);
spin_unlock_bh(&p->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
return ret;
}
@@ -211,6 +214,22 @@ nla_put_failure:
return -1;
}
+static int tcf_pedit_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_pedit_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_pedit_ops = {
.kind = "pedit",
.type = TCA_ACT_PEDIT,
@@ -219,6 +238,29 @@ static struct tc_action_ops act_pedit_ops = {
.dump = tcf_pedit_dump,
.cleanup = tcf_pedit_cleanup,
.init = tcf_pedit_init,
+ .walk = tcf_pedit_walker,
+ .lookup = tcf_pedit_search,
+};
+
+static __net_init int pedit_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+ return tc_action_net_init(tn, &act_pedit_ops, PEDIT_TAB_MASK);
+}
+
+static void __net_exit pedit_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, pedit_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations pedit_net_ops = {
+ .init = pedit_init_net,
+ .exit = pedit_exit_net,
+ .id = &pedit_net_id,
+ .size = sizeof(struct tc_action_net),
};
MODULE_AUTHOR("Jamal Hadi Salim(2002-4)");
@@ -227,12 +269,12 @@ MODULE_LICENSE("GPL");
static int __init pedit_init_module(void)
{
- return tcf_register_action(&act_pedit_ops, PEDIT_TAB_MASK);
+ return tcf_register_action(&act_pedit_ops, &pedit_net_ops);
}
static void __exit pedit_cleanup_module(void)
{
- tcf_unregister_action(&act_pedit_ops);
+ tcf_unregister_action(&act_pedit_ops, &pedit_net_ops);
}
module_init(pedit_init_module);
diff --git a/net/sched/act_police.c b/net/sched/act_police.c
index 9a1c42a43f92..330f14e302e8 100644
--- a/net/sched/act_police.c
+++ b/net/sched/act_police.c
@@ -55,10 +55,14 @@ struct tc_police_compat {
/* Each policer is serialized by its individual spinlock */
-static int tcf_act_police_walker(struct sk_buff *skb, struct netlink_callback *cb,
- int type, struct tc_action *a)
+static int police_net_id;
+
+static int tcf_act_police_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
{
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+ struct tcf_hashinfo *hinfo = tn->hinfo;
struct hlist_head *head;
struct tcf_common *p;
int err = 0, index = -1, i = 0, s_i = 0, n_i = 0;
@@ -121,7 +125,8 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
struct tc_police *parm;
struct tcf_police *police;
struct qdisc_rate_table *R_tab = NULL, *P_tab = NULL;
- struct tcf_hashinfo *hinfo = a->ops->hinfo;
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+ struct tcf_hashinfo *hinfo = tn->hinfo;
int size;
if (nla == NULL)
@@ -139,7 +144,7 @@ static int tcf_act_police_locate(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_POLICE_TBF]);
if (parm->index) {
- if (tcf_hash_search(a, parm->index)) {
+ if (tcf_hash_search(tn, a, parm->index)) {
police = to_police(a->priv);
if (bind) {
police->tcf_bindcnt += 1;
@@ -233,7 +238,7 @@ override:
police->tcfp_t_c = ktime_get_ns();
police->tcf_index = parm->index ? parm->index :
- tcf_hash_new_index(hinfo);
+ tcf_hash_new_index(tn);
h = tcf_hash(police->tcf_index, POL_TAB_MASK);
spin_lock_bh(&hinfo->lock);
hlist_add_head(&police->tcf_head, &hinfo->htab[h]);
@@ -342,6 +347,13 @@ nla_put_failure:
return -1;
}
+static int tcf_police_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
MODULE_AUTHOR("Alexey Kuznetsov");
MODULE_DESCRIPTION("Policing actions");
MODULE_LICENSE("GPL");
@@ -353,19 +365,41 @@ static struct tc_action_ops act_police_ops = {
.act = tcf_act_police,
.dump = tcf_act_police_dump,
.init = tcf_act_police_locate,
- .walk = tcf_act_police_walker
+ .walk = tcf_act_police_walker,
+ .lookup = tcf_police_search,
+};
+
+static __net_init int police_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+
+ return tc_action_net_init(tn, &act_police_ops, POL_TAB_MASK);
+}
+
+static void __net_exit police_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, police_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations police_net_ops = {
+ .init = police_init_net,
+ .exit = police_exit_net,
+ .id = &police_net_id,
+ .size = sizeof(struct tc_action_net),
};
static int __init
police_init_module(void)
{
- return tcf_register_action(&act_police_ops, POL_TAB_MASK);
+ return tcf_register_action(&act_police_ops, &police_net_ops);
}
static void __exit
police_cleanup_module(void)
{
- tcf_unregister_action(&act_police_ops);
+ tcf_unregister_action(&act_police_ops, &police_net_ops);
}
module_init(police_init_module);
diff --git a/net/sched/act_simple.c b/net/sched/act_simple.c
index d6b708d6afdf..75b2be13fbcc 100644
--- a/net/sched/act_simple.c
+++ b/net/sched/act_simple.c
@@ -26,6 +26,8 @@
#define SIMP_TAB_MASK 7
+static int simp_net_id;
+
#define SIMP_MAX_DATA 32
static int tcf_simp(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
@@ -80,6 +82,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
struct nlattr *tb[TCA_DEF_MAX + 1];
struct tc_defact *parm;
struct tcf_defact *d;
@@ -102,9 +105,9 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_DEF_PARMS]);
defdata = nla_data(tb[TCA_DEF_DATA]);
- if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
- bind, false);
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*d), bind, false);
if (ret)
return ret;
@@ -129,7 +132,7 @@ static int tcf_simp_init(struct net *net, struct nlattr *nla,
}
if (ret == ACT_P_CREATED)
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
return ret;
}
@@ -161,6 +164,22 @@ nla_put_failure:
return -1;
}
+static int tcf_simp_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_simp_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_simp_ops = {
.kind = "simple",
.type = TCA_ACT_SIMP,
@@ -169,6 +188,29 @@ static struct tc_action_ops act_simp_ops = {
.dump = tcf_simp_dump,
.cleanup = tcf_simp_release,
.init = tcf_simp_init,
+ .walk = tcf_simp_walker,
+ .lookup = tcf_simp_search,
+};
+
+static __net_init int simp_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+ return tc_action_net_init(tn, &act_simp_ops, SIMP_TAB_MASK);
+}
+
+static void __net_exit simp_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, simp_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations simp_net_ops = {
+ .init = simp_init_net,
+ .exit = simp_exit_net,
+ .id = &simp_net_id,
+ .size = sizeof(struct tc_action_net),
};
MODULE_AUTHOR("Jamal Hadi Salim(2005)");
@@ -177,8 +219,7 @@ MODULE_LICENSE("GPL");
static int __init simp_init_module(void)
{
- int ret;
- ret = tcf_register_action(&act_simp_ops, SIMP_TAB_MASK);
+ int ret = tcf_register_action(&act_simp_ops, &simp_net_ops);
if (!ret)
pr_info("Simple TC action Loaded\n");
return ret;
@@ -186,7 +227,7 @@ static int __init simp_init_module(void)
static void __exit simp_cleanup_module(void)
{
- tcf_unregister_action(&act_simp_ops);
+ tcf_unregister_action(&act_simp_ops, &simp_net_ops);
}
module_init(simp_init_module);
diff --git a/net/sched/act_skbedit.c b/net/sched/act_skbedit.c
index 6751b5f8c046..cfcdbdc00c9b 100644
--- a/net/sched/act_skbedit.c
+++ b/net/sched/act_skbedit.c
@@ -29,6 +29,8 @@
#define SKBEDIT_TAB_MASK 15
+static int skbedit_net_id;
+
static int tcf_skbedit(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
@@ -61,6 +63,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
struct nlattr *tb[TCA_SKBEDIT_MAX + 1];
struct tc_skbedit *parm;
struct tcf_skbedit *d;
@@ -98,9 +101,9 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
parm = nla_data(tb[TCA_SKBEDIT_PARMS]);
- if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*d),
- bind, false);
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*d), bind, false);
if (ret)
return ret;
@@ -130,7 +133,7 @@ static int tcf_skbedit_init(struct net *net, struct nlattr *nla,
spin_unlock_bh(&d->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
return ret;
}
@@ -173,6 +176,22 @@ nla_put_failure:
return -1;
}
+static int tcf_skbedit_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_skbedit_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_skbedit_ops = {
.kind = "skbedit",
.type = TCA_ACT_SKBEDIT,
@@ -180,6 +199,29 @@ static struct tc_action_ops act_skbedit_ops = {
.act = tcf_skbedit,
.dump = tcf_skbedit_dump,
.init = tcf_skbedit_init,
+ .walk = tcf_skbedit_walker,
+ .lookup = tcf_skbedit_search,
+};
+
+static __net_init int skbedit_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+ return tc_action_net_init(tn, &act_skbedit_ops, SKBEDIT_TAB_MASK);
+}
+
+static void __net_exit skbedit_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, skbedit_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations skbedit_net_ops = {
+ .init = skbedit_init_net,
+ .exit = skbedit_exit_net,
+ .id = &skbedit_net_id,
+ .size = sizeof(struct tc_action_net),
};
MODULE_AUTHOR("Alexander Duyck, <alexander.h.duyck@intel.com>");
@@ -188,12 +230,12 @@ MODULE_LICENSE("GPL");
static int __init skbedit_init_module(void)
{
- return tcf_register_action(&act_skbedit_ops, SKBEDIT_TAB_MASK);
+ return tcf_register_action(&act_skbedit_ops, &skbedit_net_ops);
}
static void __exit skbedit_cleanup_module(void)
{
- tcf_unregister_action(&act_skbedit_ops);
+ tcf_unregister_action(&act_skbedit_ops, &skbedit_net_ops);
}
module_init(skbedit_init_module);
diff --git a/net/sched/act_vlan.c b/net/sched/act_vlan.c
index 796785e0bf96..bab8ae0cefc0 100644
--- a/net/sched/act_vlan.c
+++ b/net/sched/act_vlan.c
@@ -21,6 +21,8 @@
#define VLAN_TAB_MASK 15
+static int vlan_net_id;
+
static int tcf_vlan(struct sk_buff *skb, const struct tc_action *a,
struct tcf_result *res)
{
@@ -68,6 +70,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
struct nlattr *est, struct tc_action *a,
int ovr, int bind)
{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
struct nlattr *tb[TCA_VLAN_MAX + 1];
struct tc_vlan *parm;
struct tcf_vlan *v;
@@ -115,9 +118,9 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
}
action = parm->v_action;
- if (!tcf_hash_check(parm->index, a, bind)) {
- ret = tcf_hash_create(parm->index, est, a, sizeof(*v),
- bind, false);
+ if (!tcf_hash_check(tn, parm->index, a, bind)) {
+ ret = tcf_hash_create(tn, parm->index, est, a,
+ sizeof(*v), bind, false);
if (ret)
return ret;
@@ -143,7 +146,7 @@ static int tcf_vlan_init(struct net *net, struct nlattr *nla,
spin_unlock_bh(&v->tcf_lock);
if (ret == ACT_P_CREATED)
- tcf_hash_insert(a);
+ tcf_hash_insert(tn, a);
return ret;
}
@@ -181,6 +184,22 @@ nla_put_failure:
return -1;
}
+static int tcf_vlan_walker(struct net *net, struct sk_buff *skb,
+ struct netlink_callback *cb, int type,
+ struct tc_action *a)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+ return tcf_generic_walker(tn, skb, cb, type, a);
+}
+
+static int tcf_vlan_search(struct net *net, struct tc_action *a, u32 index)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+ return tcf_hash_search(tn, a, index);
+}
+
static struct tc_action_ops act_vlan_ops = {
.kind = "vlan",
.type = TCA_ACT_VLAN,
@@ -188,16 +207,39 @@ static struct tc_action_ops act_vlan_ops = {
.act = tcf_vlan,
.dump = tcf_vlan_dump,
.init = tcf_vlan_init,
+ .walk = tcf_vlan_walker,
+ .lookup = tcf_vlan_search,
+};
+
+static __net_init int vlan_init_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+ return tc_action_net_init(tn, &act_vlan_ops, VLAN_TAB_MASK);
+}
+
+static void __net_exit vlan_exit_net(struct net *net)
+{
+ struct tc_action_net *tn = net_generic(net, vlan_net_id);
+
+ tc_action_net_exit(tn);
+}
+
+static struct pernet_operations vlan_net_ops = {
+ .init = vlan_init_net,
+ .exit = vlan_exit_net,
+ .id = &vlan_net_id,
+ .size = sizeof(struct tc_action_net),
};
static int __init vlan_init_module(void)
{
- return tcf_register_action(&act_vlan_ops, VLAN_TAB_MASK);
+ return tcf_register_action(&act_vlan_ops, &vlan_net_ops);
}
static void __exit vlan_cleanup_module(void)
{
- tcf_unregister_action(&act_vlan_ops);
+ tcf_unregister_action(&act_vlan_ops, &vlan_net_ops);
}
module_init(vlan_init_module);
diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c
index 5faaa5425f7b..425fe6a0eda3 100644
--- a/net/sched/cls_bpf.c
+++ b/net/sched/cls_bpf.c
@@ -79,12 +79,8 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
struct tcf_result *res)
{
struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
+ bool at_ingress = skb_at_tc_ingress(skb);
struct cls_bpf_prog *prog;
-#ifdef CONFIG_NET_CLS_ACT
- bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
-#else
- bool at_ingress = false;
-#endif
int ret = -1;
if (unlikely(!skb_mac_header_was_set(skb)))
@@ -107,8 +103,9 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
}
if (prog->exts_integrated) {
- res->class = prog->res.class;
- res->classid = qdisc_skb_cb(skb)->tc_classid;
+ res->class = 0;
+ res->classid = TC_H_MAJ(prog->res.classid) |
+ qdisc_skb_cb(skb)->tc_classid;
ret = cls_bpf_exec_opcode(filter_res);
if (ret == TC_ACT_UNSPEC)
@@ -118,10 +115,12 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
if (filter_res == 0)
continue;
-
- *res = prog->res;
- if (filter_res != -1)
+ if (filter_res != -1) {
+ res->class = 0;
res->classid = filter_res;
+ } else {
+ *res = prog->res;
+ }
ret = tcf_exts_exec(skb, &prog->exts, res);
if (ret < 0)
@@ -295,7 +294,7 @@ static int cls_bpf_prog_from_efd(struct nlattr **tb, struct cls_bpf_prog *prog,
prog->bpf_name = name;
prog->filter = fp;
- if (fp->dst_needed)
+ if (fp->dst_needed && !(tp->q->flags & TCQ_F_INGRESS))
netif_keep_dst(qdisc_dev(tp->q));
return 0;
diff --git a/net/sched/cls_flow.c b/net/sched/cls_flow.c
index 536838b657bf..fbfec6a18839 100644
--- a/net/sched/cls_flow.c
+++ b/net/sched/cls_flow.c
@@ -22,6 +22,7 @@
#include <linux/if_vlan.h>
#include <linux/slab.h>
#include <linux/module.h>
+#include <net/inet_sock.h>
#include <net/pkt_cls.h>
#include <net/ip.h>
@@ -197,8 +198,11 @@ static u32 flow_get_rtclassid(const struct sk_buff *skb)
static u32 flow_get_skuid(const struct sk_buff *skb)
{
- if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
- kuid_t skuid = skb->sk->sk_socket->file->f_cred->fsuid;
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (sk && sk->sk_socket && sk->sk_socket->file) {
+ kuid_t skuid = sk->sk_socket->file->f_cred->fsuid;
+
return from_kuid(&init_user_ns, skuid);
}
return 0;
@@ -206,8 +210,11 @@ static u32 flow_get_skuid(const struct sk_buff *skb)
static u32 flow_get_skgid(const struct sk_buff *skb)
{
- if (skb->sk && skb->sk->sk_socket && skb->sk->sk_socket->file) {
- kgid_t skgid = skb->sk->sk_socket->file->f_cred->fsgid;
+ struct sock *sk = skb_to_full_sk(skb);
+
+ if (sk && sk->sk_socket && sk->sk_socket->file) {
+ kgid_t skgid = sk->sk_socket->file->f_cred->fsgid;
+
return from_kgid(&init_user_ns, skgid);
}
return 0;
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 57692947ebbe..2181ffc76638 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -165,6 +165,51 @@ static void fl_destroy_filter(struct rcu_head *head)
kfree(f);
}
+static void fl_hw_destroy_filter(struct tcf_proto *tp, unsigned long cookie)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_flower_offload offload = {0};
+ struct tc_to_netdev tc;
+
+ if (!tc_should_offload(dev, 0))
+ return;
+
+ offload.command = TC_CLSFLOWER_DESTROY;
+ offload.cookie = cookie;
+
+ tc.type = TC_SETUP_CLSFLOWER;
+ tc.cls_flower = &offload;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
+static void fl_hw_replace_filter(struct tcf_proto *tp,
+ struct flow_dissector *dissector,
+ struct fl_flow_key *mask,
+ struct fl_flow_key *key,
+ struct tcf_exts *actions,
+ unsigned long cookie, u32 flags)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_flower_offload offload = {0};
+ struct tc_to_netdev tc;
+
+ if (!tc_should_offload(dev, flags))
+ return;
+
+ offload.command = TC_CLSFLOWER_REPLACE;
+ offload.cookie = cookie;
+ offload.dissector = dissector;
+ offload.mask = mask;
+ offload.key = key;
+ offload.exts = actions;
+
+ tc.type = TC_SETUP_CLSFLOWER;
+ tc.cls_flower = &offload;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle, tp->protocol, &tc);
+}
+
static bool fl_destroy(struct tcf_proto *tp, bool force)
{
struct cls_fl_head *head = rtnl_dereference(tp->root);
@@ -174,6 +219,7 @@ static bool fl_destroy(struct tcf_proto *tp, bool force)
return false;
list_for_each_entry_safe(f, next, &head->filters, list) {
+ fl_hw_destroy_filter(tp, (unsigned long)f);
list_del_rcu(&f->list);
call_rcu(&f->rcu, fl_destroy_filter);
}
@@ -252,23 +298,28 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
fl_set_key_val(tb, key->eth.src, TCA_FLOWER_KEY_ETH_SRC,
mask->eth.src, TCA_FLOWER_KEY_ETH_SRC_MASK,
sizeof(key->eth.src));
+
fl_set_key_val(tb, &key->basic.n_proto, TCA_FLOWER_KEY_ETH_TYPE,
&mask->basic.n_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.n_proto));
+
if (key->basic.n_proto == htons(ETH_P_IP) ||
key->basic.n_proto == htons(ETH_P_IPV6)) {
fl_set_key_val(tb, &key->basic.ip_proto, TCA_FLOWER_KEY_IP_PROTO,
&mask->basic.ip_proto, TCA_FLOWER_UNSPEC,
sizeof(key->basic.ip_proto));
}
- if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV4_ADDRS) {
+
+ if (tb[TCA_FLOWER_KEY_IPV4_SRC] || tb[TCA_FLOWER_KEY_IPV4_DST]) {
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV4_ADDRS;
fl_set_key_val(tb, &key->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC,
&mask->ipv4.src, TCA_FLOWER_KEY_IPV4_SRC_MASK,
sizeof(key->ipv4.src));
fl_set_key_val(tb, &key->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST,
&mask->ipv4.dst, TCA_FLOWER_KEY_IPV4_DST_MASK,
sizeof(key->ipv4.dst));
- } else if (key->control.addr_type == FLOW_DISSECTOR_KEY_IPV6_ADDRS) {
+ } else if (tb[TCA_FLOWER_KEY_IPV6_SRC] || tb[TCA_FLOWER_KEY_IPV6_DST]) {
+ key->control.addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
fl_set_key_val(tb, &key->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC,
&mask->ipv6.src, TCA_FLOWER_KEY_IPV6_SRC_MASK,
sizeof(key->ipv6.src));
@@ -276,6 +327,7 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
&mask->ipv6.dst, TCA_FLOWER_KEY_IPV6_DST_MASK,
sizeof(key->ipv6.dst));
}
+
if (key->basic.ip_proto == IPPROTO_TCP) {
fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
&mask->tp.src, TCA_FLOWER_UNSPEC,
@@ -453,6 +505,7 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
struct cls_fl_filter *fnew;
struct nlattr *tb[TCA_FLOWER_MAX + 1];
struct fl_flow_mask mask = {};
+ u32 flags = 0;
int err;
if (!tca[TCA_OPTIONS])
@@ -480,6 +533,9 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
}
fnew->handle = handle;
+ if (tb[TCA_FLOWER_FLAGS])
+ flags = nla_get_u32(tb[TCA_FLOWER_FLAGS]);
+
err = fl_set_parms(net, tp, fnew, &mask, base, tb, tca[TCA_RATE], ovr);
if (err)
goto errout;
@@ -492,9 +548,20 @@ static int fl_change(struct net *net, struct sk_buff *in_skb,
head->ht_params);
if (err)
goto errout;
- if (fold)
+
+ fl_hw_replace_filter(tp,
+ &head->dissector,
+ &mask.key,
+ &fnew->key,
+ &fnew->exts,
+ (unsigned long)fnew,
+ flags);
+
+ if (fold) {
rhashtable_remove_fast(&head->ht, &fold->ht_node,
head->ht_params);
+ fl_hw_destroy_filter(tp, (unsigned long)fold);
+ }
*arg = (unsigned long) fnew;
@@ -521,6 +588,7 @@ static int fl_delete(struct tcf_proto *tp, unsigned long arg)
rhashtable_remove_fast(&head->ht, &f->ht_node,
head->ht_params);
list_del_rcu(&f->list);
+ fl_hw_destroy_filter(tp, (unsigned long)f);
tcf_unbind_filter(tp, &f->res);
call_rcu(&f->rcu, fl_destroy_filter);
return 0;
diff --git a/net/sched/cls_u32.c b/net/sched/cls_u32.c
index 4fbb67430ce4..563cdad76448 100644
--- a/net/sched/cls_u32.c
+++ b/net/sched/cls_u32.c
@@ -43,6 +43,7 @@
#include <net/netlink.h>
#include <net/act_api.h>
#include <net/pkt_cls.h>
+#include <linux/netdevice.h>
struct tc_u_knode {
struct tc_u_knode __rcu *next;
@@ -58,6 +59,7 @@ struct tc_u_knode {
#ifdef CONFIG_CLS_U32_PERF
struct tc_u32_pcnt __percpu *pf;
#endif
+ u32 flags;
#ifdef CONFIG_CLS_U32_MARK
u32 val;
u32 mask;
@@ -424,6 +426,97 @@ static int u32_delete_key(struct tcf_proto *tp, struct tc_u_knode *key)
return 0;
}
+static void u32_remove_hw_knode(struct tcf_proto *tp, u32 handle)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_u32_offload u32_offload = {0};
+ struct tc_to_netdev offload;
+
+ offload.type = TC_SETUP_CLSU32;
+ offload.cls_u32 = &u32_offload;
+
+ if (tc_should_offload(dev, 0)) {
+ offload.cls_u32->command = TC_CLSU32_DELETE_KNODE;
+ offload.cls_u32->knode.handle = handle;
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ }
+}
+
+static void u32_replace_hw_hnode(struct tcf_proto *tp,
+ struct tc_u_hnode *h,
+ u32 flags)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_u32_offload u32_offload = {0};
+ struct tc_to_netdev offload;
+
+ offload.type = TC_SETUP_CLSU32;
+ offload.cls_u32 = &u32_offload;
+
+ if (tc_should_offload(dev, flags)) {
+ offload.cls_u32->command = TC_CLSU32_NEW_HNODE;
+ offload.cls_u32->hnode.divisor = h->divisor;
+ offload.cls_u32->hnode.handle = h->handle;
+ offload.cls_u32->hnode.prio = h->prio;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ }
+}
+
+static void u32_clear_hw_hnode(struct tcf_proto *tp, struct tc_u_hnode *h)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_u32_offload u32_offload = {0};
+ struct tc_to_netdev offload;
+
+ offload.type = TC_SETUP_CLSU32;
+ offload.cls_u32 = &u32_offload;
+
+ if (tc_should_offload(dev, 0)) {
+ offload.cls_u32->command = TC_CLSU32_DELETE_HNODE;
+ offload.cls_u32->hnode.divisor = h->divisor;
+ offload.cls_u32->hnode.handle = h->handle;
+ offload.cls_u32->hnode.prio = h->prio;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ }
+}
+
+static void u32_replace_hw_knode(struct tcf_proto *tp,
+ struct tc_u_knode *n,
+ u32 flags)
+{
+ struct net_device *dev = tp->q->dev_queue->dev;
+ struct tc_cls_u32_offload u32_offload = {0};
+ struct tc_to_netdev offload;
+
+ offload.type = TC_SETUP_CLSU32;
+ offload.cls_u32 = &u32_offload;
+
+ if (tc_should_offload(dev, flags)) {
+ offload.cls_u32->command = TC_CLSU32_REPLACE_KNODE;
+ offload.cls_u32->knode.handle = n->handle;
+ offload.cls_u32->knode.fshift = n->fshift;
+#ifdef CONFIG_CLS_U32_MARK
+ offload.cls_u32->knode.val = n->val;
+ offload.cls_u32->knode.mask = n->mask;
+#else
+ offload.cls_u32->knode.val = 0;
+ offload.cls_u32->knode.mask = 0;
+#endif
+ offload.cls_u32->knode.sel = &n->sel;
+ offload.cls_u32->knode.exts = &n->exts;
+ if (n->ht_down)
+ offload.cls_u32->knode.link_handle = n->ht_down->handle;
+
+ dev->netdev_ops->ndo_setup_tc(dev, tp->q->handle,
+ tp->protocol, &offload);
+ }
+}
+
static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
{
struct tc_u_knode *n;
@@ -434,6 +527,7 @@ static void u32_clear_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
RCU_INIT_POINTER(ht->ht[h],
rtnl_dereference(n->next));
tcf_unbind_filter(tp, &n->res);
+ u32_remove_hw_knode(tp, n->handle);
call_rcu(&n->rcu, u32_delete_key_freepf_rcu);
}
}
@@ -454,6 +548,7 @@ static int u32_destroy_hnode(struct tcf_proto *tp, struct tc_u_hnode *ht)
phn;
hn = &phn->next, phn = rtnl_dereference(*hn)) {
if (phn == ht) {
+ u32_clear_hw_hnode(tp, ht);
RCU_INIT_POINTER(*hn, ht->next);
kfree_rcu(ht, rcu);
return 0;
@@ -540,8 +635,10 @@ static int u32_delete(struct tcf_proto *tp, unsigned long arg)
if (ht == NULL)
return 0;
- if (TC_U32_KEY(ht->handle))
+ if (TC_U32_KEY(ht->handle)) {
+ u32_remove_hw_knode(tp, ht->handle);
return u32_delete_key(tp, (struct tc_u_knode *)ht);
+ }
if (root_ht == ht)
return -EINVAL;
@@ -587,6 +684,7 @@ static const struct nla_policy u32_policy[TCA_U32_MAX + 1] = {
[TCA_U32_SEL] = { .len = sizeof(struct tc_u32_sel) },
[TCA_U32_INDEV] = { .type = NLA_STRING, .len = IFNAMSIZ },
[TCA_U32_MARK] = { .len = sizeof(struct tc_u32_mark) },
+ [TCA_U32_FLAGS] = { .type = NLA_U32 },
};
static int u32_set_parms(struct net *net, struct tcf_proto *tp,
@@ -694,6 +792,7 @@ static struct tc_u_knode *u32_init_knode(struct tcf_proto *tp,
#endif
new->fshift = n->fshift;
new->res = n->res;
+ new->flags = n->flags;
RCU_INIT_POINTER(new->ht_down, n->ht_down);
/* bump reference count as long as we hold pointer to structure */
@@ -733,7 +832,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
struct tc_u32_sel *s;
struct nlattr *opt = tca[TCA_OPTIONS];
struct nlattr *tb[TCA_U32_MAX + 1];
- u32 htid;
+ u32 htid, flags = 0;
int err;
#ifdef CONFIG_CLS_U32_PERF
size_t size;
@@ -746,6 +845,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (err < 0)
return err;
+ if (tb[TCA_U32_FLAGS])
+ flags = nla_get_u32(tb[TCA_U32_FLAGS]);
+
n = (struct tc_u_knode *)*arg;
if (n) {
struct tc_u_knode *new;
@@ -753,6 +855,9 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
if (TC_U32_KEY(n->handle) == 0)
return -EINVAL;
+ if (n->flags != flags)
+ return -EINVAL;
+
new = u32_init_knode(tp, n);
if (!new)
return -ENOMEM;
@@ -769,6 +874,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
u32_replace_knode(tp, tp_c, new);
tcf_unbind_filter(tp, &n->res);
call_rcu(&n->rcu, u32_delete_key_rcu);
+ u32_replace_hw_knode(tp, new, flags);
return 0;
}
@@ -795,6 +901,8 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
RCU_INIT_POINTER(ht->next, tp_c->hlist);
rcu_assign_pointer(tp_c->hlist, ht);
*arg = (unsigned long)ht;
+
+ u32_replace_hw_hnode(tp, ht, flags);
return 0;
}
@@ -845,6 +953,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
RCU_INIT_POINTER(n->ht_up, ht);
n->handle = handle;
n->fshift = s->hmask ? ffs(ntohl(s->hmask)) - 1 : 0;
+ n->flags = flags;
tcf_exts_init(&n->exts, TCA_U32_ACT, TCA_U32_POLICE);
n->tp = tp;
@@ -877,7 +986,7 @@ static int u32_change(struct net *net, struct sk_buff *in_skb,
RCU_INIT_POINTER(n->next, pins);
rcu_assign_pointer(*ins, n);
-
+ u32_replace_hw_knode(tp, n, flags);
*arg = (unsigned long)n;
return 0;
}
@@ -982,6 +1091,9 @@ static int u32_dump(struct net *net, struct tcf_proto *tp, unsigned long fh,
nla_put_u32(skb, TCA_U32_LINK, ht_down->handle))
goto nla_put_failure;
+ if (n->flags && nla_put_u32(skb, TCA_U32_FLAGS, n->flags))
+ goto nla_put_failure;
+
#ifdef CONFIG_CLS_U32_MARK
if ((n->val || n->mask)) {
struct tc_u32_mark mark = {.val = n->val,
diff --git a/net/sched/em_meta.c b/net/sched/em_meta.c
index b5294ce20cd4..f2aabc0089da 100644
--- a/net/sched/em_meta.c
+++ b/net/sched/em_meta.c
@@ -343,119 +343,145 @@ META_COLLECTOR(int_sk_refcnt)
META_COLLECTOR(int_sk_rcvbuf)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_rcvbuf;
+ dst->value = sk->sk_rcvbuf;
}
META_COLLECTOR(int_sk_shutdown)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_shutdown;
+ dst->value = sk->sk_shutdown;
}
META_COLLECTOR(int_sk_proto)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_protocol;
+ dst->value = sk->sk_protocol;
}
META_COLLECTOR(int_sk_type)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_type;
+ dst->value = sk->sk_type;
}
META_COLLECTOR(int_sk_rmem_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = sk_rmem_alloc_get(skb->sk);
+ dst->value = sk_rmem_alloc_get(sk);
}
META_COLLECTOR(int_sk_wmem_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = sk_wmem_alloc_get(skb->sk);
+ dst->value = sk_wmem_alloc_get(sk);
}
META_COLLECTOR(int_sk_omem_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = atomic_read(&skb->sk->sk_omem_alloc);
+ dst->value = atomic_read(&sk->sk_omem_alloc);
}
META_COLLECTOR(int_sk_rcv_qlen)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_receive_queue.qlen;
+ dst->value = sk->sk_receive_queue.qlen;
}
META_COLLECTOR(int_sk_snd_qlen)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_write_queue.qlen;
+ dst->value = sk->sk_write_queue.qlen;
}
META_COLLECTOR(int_sk_wmem_queued)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_wmem_queued;
+ dst->value = sk->sk_wmem_queued;
}
META_COLLECTOR(int_sk_fwd_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_forward_alloc;
+ dst->value = sk->sk_forward_alloc;
}
META_COLLECTOR(int_sk_sndbuf)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_sndbuf;
+ dst->value = sk->sk_sndbuf;
}
META_COLLECTOR(int_sk_alloc)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = (__force int) skb->sk->sk_allocation;
+ dst->value = (__force int) sk->sk_allocation;
}
META_COLLECTOR(int_sk_hash)
@@ -469,92 +495,112 @@ META_COLLECTOR(int_sk_hash)
META_COLLECTOR(int_sk_lingertime)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_lingertime / HZ;
+ dst->value = sk->sk_lingertime / HZ;
}
META_COLLECTOR(int_sk_err_qlen)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_error_queue.qlen;
+ dst->value = sk->sk_error_queue.qlen;
}
META_COLLECTOR(int_sk_ack_bl)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_ack_backlog;
+ dst->value = sk->sk_ack_backlog;
}
META_COLLECTOR(int_sk_max_ack_bl)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_max_ack_backlog;
+ dst->value = sk->sk_max_ack_backlog;
}
META_COLLECTOR(int_sk_prio)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_priority;
+ dst->value = sk->sk_priority;
}
META_COLLECTOR(int_sk_rcvlowat)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_rcvlowat;
+ dst->value = sk->sk_rcvlowat;
}
META_COLLECTOR(int_sk_rcvtimeo)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_rcvtimeo / HZ;
+ dst->value = sk->sk_rcvtimeo / HZ;
}
META_COLLECTOR(int_sk_sndtimeo)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_sndtimeo / HZ;
+ dst->value = sk->sk_sndtimeo / HZ;
}
META_COLLECTOR(int_sk_sendmsg_off)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_frag.offset;
+ dst->value = sk->sk_frag.offset;
}
META_COLLECTOR(int_sk_write_pend)
{
- if (skip_nonlocal(skb)) {
+ const struct sock *sk = skb_to_full_sk(skb);
+
+ if (!sk) {
*err = -1;
return;
}
- dst->value = skb->sk->sk_write_pending;
+ dst->value = sk->sk_write_pending;
}
/**************************************************************************
diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index f43c8f33f09e..3b180ff72f79 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -253,7 +253,8 @@ int qdisc_set_default(const char *name)
}
/* We know handle. Find qdisc among all qdisc's attached to device
- (root qdisc, all its children, children of children etc.)
+ * (root qdisc, all its children, children of children etc.)
+ * Note: caller either uses rtnl or rcu_read_lock()
*/
static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
@@ -264,7 +265,7 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc *root, u32 handle)
root->handle == handle)
return root;
- list_for_each_entry(q, &root->list, list) {
+ list_for_each_entry_rcu(q, &root->list, list) {
if (q->handle == handle)
return q;
}
@@ -277,15 +278,18 @@ void qdisc_list_add(struct Qdisc *q)
struct Qdisc *root = qdisc_dev(q)->qdisc;
WARN_ON_ONCE(root == &noop_qdisc);
- list_add_tail(&q->list, &root->list);
+ ASSERT_RTNL();
+ list_add_tail_rcu(&q->list, &root->list);
}
}
EXPORT_SYMBOL(qdisc_list_add);
void qdisc_list_del(struct Qdisc *q)
{
- if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS))
- list_del(&q->list);
+ if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
+ ASSERT_RTNL();
+ list_del_rcu(&q->list);
+ }
}
EXPORT_SYMBOL(qdisc_list_del);
@@ -740,24 +744,29 @@ static u32 qdisc_alloc_handle(struct net_device *dev)
return 0;
}
-void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
+void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned int n,
+ unsigned int len)
{
const struct Qdisc_class_ops *cops;
unsigned long cl;
u32 parentid;
int drops;
- if (n == 0)
+ if (n == 0 && len == 0)
return;
drops = max_t(int, n, 0);
+ rcu_read_lock();
while ((parentid = sch->parent)) {
if (TC_H_MAJ(parentid) == TC_H_MAJ(TC_H_INGRESS))
- return;
+ break;
+ if (sch->flags & TCQ_F_NOPARENT)
+ break;
+ /* TODO: perform the search on a per txq basis */
sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
if (sch == NULL) {
- WARN_ON(parentid != TC_H_ROOT);
- return;
+ WARN_ON_ONCE(parentid != TC_H_ROOT);
+ break;
}
cops = sch->ops->cl_ops;
if (cops->qlen_notify) {
@@ -766,10 +775,12 @@ void qdisc_tree_decrease_qlen(struct Qdisc *sch, unsigned int n)
cops->put(sch, cl);
}
sch->q.qlen -= n;
+ sch->qstats.backlog -= len;
__qdisc_qstats_drop(sch, drops);
}
+ rcu_read_unlock();
}
-EXPORT_SYMBOL(qdisc_tree_decrease_qlen);
+EXPORT_SYMBOL(qdisc_tree_reduce_backlog);
static void notify_and_destroy(struct net *net, struct sk_buff *skb,
struct nlmsghdr *n, u32 clid,
@@ -1832,7 +1843,7 @@ reclassify:
return err;
}
- return -1;
+ return TC_ACT_UNSPEC; /* signal: continue lookup */
#ifdef CONFIG_NET_CLS_ACT
reset:
if (unlikely(limit++ >= MAX_REC_LOOP)) {
@@ -1843,6 +1854,7 @@ reset:
}
tp = old_tp;
+ protocol = tc_skb_protocol(skb);
goto reclassify;
#endif
}
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index c538d9e4a8f6..baafddf229ce 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1624,13 +1624,8 @@ static int cbq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
new->reshape_fail = cbq_reshape_fail;
#endif
}
- sch_tree_lock(sch);
- *old = cl->q;
- cl->q = new;
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- sch_tree_unlock(sch);
+ *old = qdisc_replace(sch, new, &cl->q);
return 0;
}
@@ -1914,7 +1909,7 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
{
struct cbq_sched_data *q = qdisc_priv(sch);
struct cbq_class *cl = (struct cbq_class *)arg;
- unsigned int qlen;
+ unsigned int qlen, backlog;
if (cl->filters || cl->children || cl == &q->link)
return -EBUSY;
@@ -1922,8 +1917,9 @@ static int cbq_delete(struct Qdisc *sch, unsigned long arg)
sch_tree_lock(sch);
qlen = cl->q->q.qlen;
+ backlog = cl->q->qstats.backlog;
qdisc_reset(cl->q);
- qdisc_tree_decrease_qlen(cl->q, qlen);
+ qdisc_tree_reduce_backlog(cl->q, qlen, backlog);
if (cl->next_alive)
cbq_deactivate_class(cl);
diff --git a/net/sched/sch_choke.c b/net/sched/sch_choke.c
index 5ffb8b8337c7..0a08c860eee4 100644
--- a/net/sched/sch_choke.c
+++ b/net/sched/sch_choke.c
@@ -128,8 +128,8 @@ static void choke_drop_by_idx(struct Qdisc *sch, unsigned int idx)
choke_zap_tail_holes(q);
qdisc_qstats_backlog_dec(sch, skb);
+ qdisc_tree_reduce_backlog(sch, 1, qdisc_pkt_len(skb));
qdisc_drop(skb, sch);
- qdisc_tree_decrease_qlen(sch, 1);
--sch->q.qlen;
}
@@ -456,6 +456,7 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
old = q->tab;
if (old) {
unsigned int oqlen = sch->q.qlen, tail = 0;
+ unsigned dropped = 0;
while (q->head != q->tail) {
struct sk_buff *skb = q->tab[q->head];
@@ -467,11 +468,12 @@ static int choke_change(struct Qdisc *sch, struct nlattr *opt)
ntab[tail++] = skb;
continue;
}
+ dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
--sch->q.qlen;
qdisc_drop(skb, sch);
}
- qdisc_tree_decrease_qlen(sch, oqlen - sch->q.qlen);
+ qdisc_tree_reduce_backlog(sch, oqlen - sch->q.qlen, dropped);
q->head = 0;
q->tail = tail;
}
diff --git a/net/sched/sch_codel.c b/net/sched/sch_codel.c
index 535007d5f0b5..9b7e2980ee5c 100644
--- a/net/sched/sch_codel.c
+++ b/net/sched/sch_codel.c
@@ -79,12 +79,13 @@ static struct sk_buff *codel_qdisc_dequeue(struct Qdisc *sch)
skb = codel_dequeue(sch, &q->params, &q->vars, &q->stats, dequeue);
- /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+ /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
* or HTB crashes. Defer it for next round.
*/
if (q->stats.drop_count && sch->q.qlen) {
- qdisc_tree_decrease_qlen(sch, q->stats.drop_count);
+ qdisc_tree_reduce_backlog(sch, q->stats.drop_count, q->stats.drop_len);
q->stats.drop_count = 0;
+ q->stats.drop_len = 0;
}
if (skb)
qdisc_bstats_update(sch, skb);
@@ -116,7 +117,7 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
{
struct codel_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_CODEL_MAX + 1];
- unsigned int qlen;
+ unsigned int qlen, dropped = 0;
int err;
if (!opt)
@@ -156,10 +157,11 @@ static int codel_change(struct Qdisc *sch, struct nlattr *opt)
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = __skb_dequeue(&sch->q);
+ dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
qdisc_drop(skb, sch);
}
- qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+ qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index f26bdea875c1..a63e879e8975 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -53,9 +53,10 @@ static struct drr_class *drr_find_class(struct Qdisc *sch, u32 classid)
static void drr_purge_queue(struct drr_class *cl)
{
unsigned int len = cl->qdisc->q.qlen;
+ unsigned int backlog = cl->qdisc->qstats.backlog;
qdisc_reset(cl->qdisc);
- qdisc_tree_decrease_qlen(cl->qdisc, len);
+ qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
}
static const struct nla_policy drr_policy[TCA_DRR_MAX + 1] = {
@@ -226,11 +227,7 @@ static int drr_graft_class(struct Qdisc *sch, unsigned long arg,
new = &noop_qdisc;
}
- sch_tree_lock(sch);
- drr_purge_queue(cl);
- *old = cl->qdisc;
- cl->qdisc = new;
- sch_tree_unlock(sch);
+ *old = qdisc_replace(sch, new, &cl->qdisc);
return 0;
}
@@ -403,6 +400,8 @@ static struct sk_buff *drr_dequeue(struct Qdisc *sch)
if (len <= cl->deficit) {
cl->deficit -= len;
skb = qdisc_dequeue_peeked(cl->qdisc);
+ if (unlikely(skb == NULL))
+ goto out;
if (cl->qdisc->q.qlen == 0)
list_del(&cl->alist);
diff --git a/net/sched/sch_dsmark.c b/net/sched/sch_dsmark.c
index f357f34d02d2..34b4ddaca27c 100644
--- a/net/sched/sch_dsmark.c
+++ b/net/sched/sch_dsmark.c
@@ -73,13 +73,7 @@ static int dsmark_graft(struct Qdisc *sch, unsigned long arg,
new = &noop_qdisc;
}
- sch_tree_lock(sch);
- *old = p->q;
- p->q = new;
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- sch_tree_unlock(sch);
-
+ *old = qdisc_replace(sch, new, &p->q);
return 0;
}
@@ -264,6 +258,7 @@ static int dsmark_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return err;
}
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
@@ -281,11 +276,12 @@ static struct sk_buff *dsmark_dequeue(struct Qdisc *sch)
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
- skb = p->q->ops->dequeue(p->q);
+ skb = qdisc_dequeue_peeked(p->q);
if (skb == NULL)
return NULL;
qdisc_bstats_update(sch, skb);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
index = skb->tc_index & (p->indices - 1);
@@ -401,6 +397,7 @@ static void dsmark_reset(struct Qdisc *sch)
pr_debug("%s(sch %p,[qdisc %p])\n", __func__, sch, p);
qdisc_reset(p->q);
+ sch->qstats.backlog = 0;
sch->q.qlen = 0;
}
diff --git a/net/sched/sch_fq.c b/net/sched/sch_fq.c
index 109b2322778f..3c6a47d66a04 100644
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -662,6 +662,7 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
struct fq_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_FQ_MAX + 1];
int err, drop_count = 0;
+ unsigned drop_len = 0;
u32 fq_log;
if (!opt)
@@ -736,10 +737,11 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
if (!skb)
break;
+ drop_len += qdisc_pkt_len(skb);
kfree_skb(skb);
drop_count++;
}
- qdisc_tree_decrease_qlen(sch, drop_count);
+ qdisc_tree_reduce_backlog(sch, drop_count, drop_len);
sch_tree_unlock(sch);
return err;
diff --git a/net/sched/sch_fq_codel.c b/net/sched/sch_fq_codel.c
index 4c834e93dafb..d3fc8f9dd3d4 100644
--- a/net/sched/sch_fq_codel.c
+++ b/net/sched/sch_fq_codel.c
@@ -175,7 +175,7 @@ static unsigned int fq_codel_qdisc_drop(struct Qdisc *sch)
static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct fq_codel_sched_data *q = qdisc_priv(sch);
- unsigned int idx;
+ unsigned int idx, prev_backlog;
struct fq_codel_flow *flow;
int uninitialized_var(ret);
@@ -203,6 +203,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (++sch->q.qlen <= sch->limit)
return NET_XMIT_SUCCESS;
+ prev_backlog = sch->qstats.backlog;
q->drop_overlimit++;
/* Return Congestion Notification only if we dropped a packet
* from this flow.
@@ -211,7 +212,7 @@ static int fq_codel_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_CN;
/* As we dropped a packet, better let upper stack know this */
- qdisc_tree_decrease_qlen(sch, 1);
+ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
return NET_XMIT_SUCCESS;
}
@@ -241,6 +242,7 @@ static struct sk_buff *fq_codel_dequeue(struct Qdisc *sch)
struct fq_codel_flow *flow;
struct list_head *head;
u32 prev_drop_count, prev_ecn_mark;
+ unsigned int prev_backlog;
begin:
head = &q->new_flows;
@@ -259,6 +261,7 @@ begin:
prev_drop_count = q->cstats.drop_count;
prev_ecn_mark = q->cstats.ecn_mark;
+ prev_backlog = sch->qstats.backlog;
skb = codel_dequeue(sch, &q->cparams, &flow->cvars, &q->cstats,
dequeue);
@@ -276,12 +279,14 @@ begin:
}
qdisc_bstats_update(sch, skb);
flow->deficit -= qdisc_pkt_len(skb);
- /* We cant call qdisc_tree_decrease_qlen() if our qlen is 0,
+ /* We cant call qdisc_tree_reduce_backlog() if our qlen is 0,
* or HTB crashes. Defer it for next round.
*/
if (q->cstats.drop_count && sch->q.qlen) {
- qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+ qdisc_tree_reduce_backlog(sch, q->cstats.drop_count,
+ q->cstats.drop_len);
q->cstats.drop_count = 0;
+ q->cstats.drop_len = 0;
}
return skb;
}
@@ -372,11 +377,13 @@ static int fq_codel_change(struct Qdisc *sch, struct nlattr *opt)
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = fq_codel_dequeue(sch);
+ q->cstats.drop_len += qdisc_pkt_len(skb);
kfree_skb(skb);
q->cstats.drop_count++;
}
- qdisc_tree_decrease_qlen(sch, q->cstats.drop_count);
+ qdisc_tree_reduce_backlog(sch, q->cstats.drop_count, q->cstats.drop_len);
q->cstats.drop_count = 0;
+ q->cstats.drop_len = 0;
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_generic.c b/net/sched/sch_generic.c
index cb5d4ad32946..f18c35024207 100644
--- a/net/sched/sch_generic.c
+++ b/net/sched/sch_generic.c
@@ -567,6 +567,7 @@ struct Qdisc_ops pfifo_fast_ops __read_mostly = {
.dump = pfifo_fast_dump,
.owner = THIS_MODULE,
};
+EXPORT_SYMBOL(pfifo_fast_ops);
static struct lock_class_key qdisc_tx_busylock;
@@ -658,8 +659,10 @@ static void qdisc_rcu_free(struct rcu_head *head)
{
struct Qdisc *qdisc = container_of(head, struct Qdisc, rcu_head);
- if (qdisc_is_percpu_stats(qdisc))
+ if (qdisc_is_percpu_stats(qdisc)) {
free_percpu(qdisc->cpu_bstats);
+ free_percpu(qdisc->cpu_qstats);
+ }
kfree((char *) qdisc - qdisc->padded);
}
@@ -737,7 +740,7 @@ static void attach_one_default_qdisc(struct net_device *dev,
return;
}
if (!netif_is_multiqueue(dev))
- qdisc->flags |= TCQ_F_ONETXQUEUE;
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
dev_queue->qdisc_sleeping = qdisc;
}
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index b7ebe2c87586..d783d7cc3348 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -895,9 +895,10 @@ static void
hfsc_purge_queue(struct Qdisc *sch, struct hfsc_class *cl)
{
unsigned int len = cl->qdisc->q.qlen;
+ unsigned int backlog = cl->qdisc->qstats.backlog;
qdisc_reset(cl->qdisc);
- qdisc_tree_decrease_qlen(cl->qdisc, len);
+ qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
}
static void
@@ -1215,11 +1216,7 @@ hfsc_graft_class(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
new = &noop_qdisc;
}
- sch_tree_lock(sch);
- hfsc_purge_queue(sch, cl);
- *old = cl->qdisc;
- cl->qdisc = new;
- sch_tree_unlock(sch);
+ *old = qdisc_replace(sch, new, &cl->qdisc);
return 0;
}
diff --git a/net/sched/sch_hhf.c b/net/sched/sch_hhf.c
index 86b04e31e60b..13d6f83ec491 100644
--- a/net/sched/sch_hhf.c
+++ b/net/sched/sch_hhf.c
@@ -382,6 +382,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
struct hhf_sched_data *q = qdisc_priv(sch);
enum wdrr_bucket_idx idx;
struct wdrr_bucket *bucket;
+ unsigned int prev_backlog;
idx = hhf_classify(skb, sch);
@@ -409,6 +410,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
if (++sch->q.qlen <= sch->limit)
return NET_XMIT_SUCCESS;
+ prev_backlog = sch->qstats.backlog;
q->drop_overlimit++;
/* Return Congestion Notification only if we dropped a packet from this
* bucket.
@@ -417,7 +419,7 @@ static int hhf_enqueue(struct sk_buff *skb, struct Qdisc *sch)
return NET_XMIT_CN;
/* As we dropped a packet, better let upper stack know this. */
- qdisc_tree_decrease_qlen(sch, 1);
+ qdisc_tree_reduce_backlog(sch, 1, prev_backlog - sch->qstats.backlog);
return NET_XMIT_SUCCESS;
}
@@ -527,7 +529,7 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
{
struct hhf_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_HHF_MAX + 1];
- unsigned int qlen;
+ unsigned int qlen, prev_backlog;
int err;
u64 non_hh_quantum;
u32 new_quantum = q->quantum;
@@ -577,12 +579,14 @@ static int hhf_change(struct Qdisc *sch, struct nlattr *opt)
}
qlen = sch->q.qlen;
+ prev_backlog = sch->qstats.backlog;
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = hhf_dequeue(sch);
kfree_skb(skb);
}
- qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+ qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen,
+ prev_backlog - sch->qstats.backlog);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 15ccd7f8fb2a..87b02ed3d5f2 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -600,6 +600,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch)
htb_activate(q, cl);
}
+ qdisc_qstats_backlog_inc(sch, skb);
sch->q.qlen++;
return NET_XMIT_SUCCESS;
}
@@ -889,6 +890,7 @@ static struct sk_buff *htb_dequeue(struct Qdisc *sch)
ok:
qdisc_bstats_update(sch, skb);
qdisc_unthrottled(sch);
+ qdisc_qstats_backlog_dec(sch, skb);
sch->q.qlen--;
return skb;
}
@@ -955,6 +957,7 @@ static unsigned int htb_drop(struct Qdisc *sch)
unsigned int len;
if (cl->un.leaf.q->ops->drop &&
(len = cl->un.leaf.q->ops->drop(cl->un.leaf.q))) {
+ sch->qstats.backlog -= len;
sch->q.qlen--;
if (!cl->un.leaf.q->q.qlen)
htb_deactivate(q, cl);
@@ -984,12 +987,12 @@ static void htb_reset(struct Qdisc *sch)
}
cl->prio_activity = 0;
cl->cmode = HTB_CAN_SEND;
-
}
}
qdisc_watchdog_cancel(&q->watchdog);
__skb_queue_purge(&q->direct_queue);
sch->q.qlen = 0;
+ sch->qstats.backlog = 0;
memset(q->hlevel, 0, sizeof(q->hlevel));
memset(q->row_mask, 0, sizeof(q->row_mask));
for (i = 0; i < TC_HTB_NUMPRIO; i++)
@@ -1163,14 +1166,7 @@ static int htb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
cl->common.classid)) == NULL)
return -ENOBUFS;
- sch_tree_lock(sch);
- *old = cl->un.leaf.q;
- cl->un.leaf.q = new;
- if (*old != NULL) {
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- }
- sch_tree_unlock(sch);
+ *old = qdisc_replace(sch, new, &cl->un.leaf.q);
return 0;
}
@@ -1272,7 +1268,6 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
{
struct htb_sched *q = qdisc_priv(sch);
struct htb_class *cl = (struct htb_class *)arg;
- unsigned int qlen;
struct Qdisc *new_q = NULL;
int last_child = 0;
@@ -1292,9 +1287,11 @@ static int htb_delete(struct Qdisc *sch, unsigned long arg)
sch_tree_lock(sch);
if (!cl->level) {
- qlen = cl->un.leaf.q->q.qlen;
+ unsigned int qlen = cl->un.leaf.q->q.qlen;
+ unsigned int backlog = cl->un.leaf.q->qstats.backlog;
+
qdisc_reset(cl->un.leaf.q);
- qdisc_tree_decrease_qlen(cl->un.leaf.q, qlen);
+ qdisc_tree_reduce_backlog(cl->un.leaf.q, qlen, backlog);
}
/* delete from hash and active; remainder in destroy_class */
@@ -1428,10 +1425,11 @@ static int htb_change_class(struct Qdisc *sch, u32 classid,
sch_tree_lock(sch);
if (parent && !parent->level) {
unsigned int qlen = parent->un.leaf.q->q.qlen;
+ unsigned int backlog = parent->un.leaf.q->qstats.backlog;
/* turn parent into inner node */
qdisc_reset(parent->un.leaf.q);
- qdisc_tree_decrease_qlen(parent->un.leaf.q, qlen);
+ qdisc_tree_reduce_backlog(parent->un.leaf.q, qlen, backlog);
qdisc_destroy(parent->un.leaf.q);
if (parent->prio_activity)
htb_deactivate(q, parent);
diff --git a/net/sched/sch_ingress.c b/net/sched/sch_ingress.c
index e7c648fa9dc3..10adbc617905 100644
--- a/net/sched/sch_ingress.c
+++ b/net/sched/sch_ingress.c
@@ -1,4 +1,5 @@
-/* net/sched/sch_ingress.c - Ingress qdisc
+/* net/sched/sch_ingress.c - Ingress and clsact qdisc
+ *
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License
* as published by the Free Software Foundation; either version
@@ -98,17 +99,100 @@ static struct Qdisc_ops ingress_qdisc_ops __read_mostly = {
.owner = THIS_MODULE,
};
+static unsigned long clsact_get(struct Qdisc *sch, u32 classid)
+{
+ switch (TC_H_MIN(classid)) {
+ case TC_H_MIN(TC_H_MIN_INGRESS):
+ case TC_H_MIN(TC_H_MIN_EGRESS):
+ return TC_H_MIN(classid);
+ default:
+ return 0;
+ }
+}
+
+static unsigned long clsact_bind_filter(struct Qdisc *sch,
+ unsigned long parent, u32 classid)
+{
+ return clsact_get(sch, classid);
+}
+
+static struct tcf_proto __rcu **clsact_find_tcf(struct Qdisc *sch,
+ unsigned long cl)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ switch (cl) {
+ case TC_H_MIN(TC_H_MIN_INGRESS):
+ return &dev->ingress_cl_list;
+ case TC_H_MIN(TC_H_MIN_EGRESS):
+ return &dev->egress_cl_list;
+ default:
+ return NULL;
+ }
+}
+
+static int clsact_init(struct Qdisc *sch, struct nlattr *opt)
+{
+ net_inc_ingress_queue();
+ net_inc_egress_queue();
+
+ sch->flags |= TCQ_F_CPUSTATS;
+
+ return 0;
+}
+
+static void clsact_destroy(struct Qdisc *sch)
+{
+ struct net_device *dev = qdisc_dev(sch);
+
+ tcf_destroy_chain(&dev->ingress_cl_list);
+ tcf_destroy_chain(&dev->egress_cl_list);
+
+ net_dec_ingress_queue();
+ net_dec_egress_queue();
+}
+
+static const struct Qdisc_class_ops clsact_class_ops = {
+ .leaf = ingress_leaf,
+ .get = clsact_get,
+ .put = ingress_put,
+ .walk = ingress_walk,
+ .tcf_chain = clsact_find_tcf,
+ .bind_tcf = clsact_bind_filter,
+ .unbind_tcf = ingress_put,
+};
+
+static struct Qdisc_ops clsact_qdisc_ops __read_mostly = {
+ .cl_ops = &clsact_class_ops,
+ .id = "clsact",
+ .init = clsact_init,
+ .destroy = clsact_destroy,
+ .dump = ingress_dump,
+ .owner = THIS_MODULE,
+};
+
static int __init ingress_module_init(void)
{
- return register_qdisc(&ingress_qdisc_ops);
+ int ret;
+
+ ret = register_qdisc(&ingress_qdisc_ops);
+ if (!ret) {
+ ret = register_qdisc(&clsact_qdisc_ops);
+ if (ret)
+ unregister_qdisc(&ingress_qdisc_ops);
+ }
+
+ return ret;
}
static void __exit ingress_module_exit(void)
{
unregister_qdisc(&ingress_qdisc_ops);
+ unregister_qdisc(&clsact_qdisc_ops);
}
module_init(ingress_module_init);
module_exit(ingress_module_exit);
+MODULE_ALIAS("sch_clsact");
MODULE_LICENSE("GPL");
diff --git a/net/sched/sch_mq.c b/net/sched/sch_mq.c
index f3cbaecd283a..56a77b878eb3 100644
--- a/net/sched/sch_mq.c
+++ b/net/sched/sch_mq.c
@@ -57,13 +57,13 @@ static int mq_init(struct Qdisc *sch, struct nlattr *opt)
for (ntx = 0; ntx < dev->num_tx_queues; ntx++) {
dev_queue = netdev_get_tx_queue(dev, ntx);
- qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+ qdisc = qdisc_create_dflt(dev_queue, get_default_qdisc_ops(dev, ntx),
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(ntx + 1)));
if (qdisc == NULL)
goto err;
priv->qdiscs[ntx] = qdisc;
- qdisc->flags |= TCQ_F_ONETXQUEUE;
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
}
sch->flags |= TCQ_F_MQROOT;
@@ -156,7 +156,7 @@ static int mq_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
*old = dev_graft_qdisc(dev_queue, new);
if (new)
- new->flags |= TCQ_F_ONETXQUEUE;
+ new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
if (dev->flags & IFF_UP)
dev_activate(dev);
return 0;
diff --git a/net/sched/sch_mqprio.c b/net/sched/sch_mqprio.c
index 3811a745452c..b8002ce3d010 100644
--- a/net/sched/sch_mqprio.c
+++ b/net/sched/sch_mqprio.c
@@ -28,6 +28,7 @@ static void mqprio_destroy(struct Qdisc *sch)
{
struct net_device *dev = qdisc_dev(sch);
struct mqprio_sched *priv = qdisc_priv(sch);
+ struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO};
unsigned int ntx;
if (priv->qdiscs) {
@@ -39,7 +40,7 @@ static void mqprio_destroy(struct Qdisc *sch)
}
if (priv->hw_owned && dev->netdev_ops->ndo_setup_tc)
- dev->netdev_ops->ndo_setup_tc(dev, 0);
+ dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
else
netdev_set_num_tc(dev, 0);
}
@@ -124,7 +125,8 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
for (i = 0; i < dev->num_tx_queues; i++) {
dev_queue = netdev_get_tx_queue(dev, i);
- qdisc = qdisc_create_dflt(dev_queue, default_qdisc_ops,
+ qdisc = qdisc_create_dflt(dev_queue,
+ get_default_qdisc_ops(dev, i),
TC_H_MAKE(TC_H_MAJ(sch->handle),
TC_H_MIN(i + 1)));
if (qdisc == NULL) {
@@ -132,7 +134,7 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
goto err;
}
priv->qdiscs[i] = qdisc;
- qdisc->flags |= TCQ_F_ONETXQUEUE;
+ qdisc->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
}
/* If the mqprio options indicate that hardware should own
@@ -140,8 +142,11 @@ static int mqprio_init(struct Qdisc *sch, struct nlattr *opt)
* supplied and verified mapping
*/
if (qopt->hw) {
+ struct tc_to_netdev tc = {.type = TC_SETUP_MQPRIO,
+ { .tc = qopt->num_tc }};
+
priv->hw_owned = 1;
- err = dev->netdev_ops->ndo_setup_tc(dev, qopt->num_tc);
+ err = dev->netdev_ops->ndo_setup_tc(dev, sch->handle, 0, &tc);
if (err)
goto err;
} else {
@@ -209,7 +214,7 @@ static int mqprio_graft(struct Qdisc *sch, unsigned long cl, struct Qdisc *new,
*old = dev_graft_qdisc(dev_queue, new);
if (new)
- new->flags |= TCQ_F_ONETXQUEUE;
+ new->flags |= TCQ_F_ONETXQUEUE | TCQ_F_NOPARENT;
if (dev->flags & IFF_UP)
dev_activate(dev);
diff --git a/net/sched/sch_multiq.c b/net/sched/sch_multiq.c
index 4e904ca0af9d..bcdd54bb101c 100644
--- a/net/sched/sch_multiq.c
+++ b/net/sched/sch_multiq.c
@@ -218,7 +218,8 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
if (q->queues[i] != &noop_qdisc) {
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
- qdisc_tree_decrease_qlen(child, child->q.qlen);
+ qdisc_tree_reduce_backlog(child, child->q.qlen,
+ child->qstats.backlog);
qdisc_destroy(child);
}
}
@@ -238,8 +239,9 @@ static int multiq_tune(struct Qdisc *sch, struct nlattr *opt)
q->queues[i] = child;
if (old != &noop_qdisc) {
- qdisc_tree_decrease_qlen(old,
- old->q.qlen);
+ qdisc_tree_reduce_backlog(old,
+ old->q.qlen,
+ old->qstats.backlog);
qdisc_destroy(old);
}
sch_tree_unlock(sch);
@@ -303,13 +305,7 @@ static int multiq_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
if (new == NULL)
new = &noop_qdisc;
- sch_tree_lock(sch);
- *old = q->queues[band];
- q->queues[band] = new;
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- sch_tree_unlock(sch);
-
+ *old = qdisc_replace(sch, new, &q->queues[band]);
return 0;
}
diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
index 5abd1d9de989..9640bb39a5d2 100644
--- a/net/sched/sch_netem.c
+++ b/net/sched/sch_netem.c
@@ -598,7 +598,8 @@ deliver:
if (unlikely(err != NET_XMIT_SUCCESS)) {
if (net_xmit_drop_count(err)) {
qdisc_qstats_drop(sch);
- qdisc_tree_decrease_qlen(sch, 1);
+ qdisc_tree_reduce_backlog(sch, 1,
+ qdisc_pkt_len(skb));
}
}
goto tfifo_dequeue;
@@ -1037,15 +1038,7 @@ static int netem_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
{
struct netem_sched_data *q = qdisc_priv(sch);
- sch_tree_lock(sch);
- *old = q->qdisc;
- q->qdisc = new;
- if (*old) {
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- }
- sch_tree_unlock(sch);
-
+ *old = qdisc_replace(sch, new, &q->qdisc);
return 0;
}
diff --git a/net/sched/sch_pie.c b/net/sched/sch_pie.c
index b783a446d884..71ae3b9629f9 100644
--- a/net/sched/sch_pie.c
+++ b/net/sched/sch_pie.c
@@ -183,7 +183,7 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
{
struct pie_sched_data *q = qdisc_priv(sch);
struct nlattr *tb[TCA_PIE_MAX + 1];
- unsigned int qlen;
+ unsigned int qlen, dropped = 0;
int err;
if (!opt)
@@ -232,10 +232,11 @@ static int pie_change(struct Qdisc *sch, struct nlattr *opt)
while (sch->q.qlen > sch->limit) {
struct sk_buff *skb = __skb_dequeue(&sch->q);
+ dropped += qdisc_pkt_len(skb);
qdisc_qstats_backlog_dec(sch, skb);
qdisc_drop(skb, sch);
}
- qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+ qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
sch_tree_unlock(sch);
return 0;
diff --git a/net/sched/sch_prio.c b/net/sched/sch_prio.c
index ba6487f2741f..fee1b15506b2 100644
--- a/net/sched/sch_prio.c
+++ b/net/sched/sch_prio.c
@@ -191,7 +191,7 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
struct Qdisc *child = q->queues[i];
q->queues[i] = &noop_qdisc;
if (child != &noop_qdisc) {
- qdisc_tree_decrease_qlen(child, child->q.qlen);
+ qdisc_tree_reduce_backlog(child, child->q.qlen, child->qstats.backlog);
qdisc_destroy(child);
}
}
@@ -210,8 +210,9 @@ static int prio_tune(struct Qdisc *sch, struct nlattr *opt)
q->queues[i] = child;
if (old != &noop_qdisc) {
- qdisc_tree_decrease_qlen(old,
- old->q.qlen);
+ qdisc_tree_reduce_backlog(old,
+ old->q.qlen,
+ old->qstats.backlog);
qdisc_destroy(old);
}
sch_tree_unlock(sch);
@@ -268,13 +269,7 @@ static int prio_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
if (new == NULL)
new = &noop_qdisc;
- sch_tree_lock(sch);
- *old = q->queues[band];
- q->queues[band] = new;
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- sch_tree_unlock(sch);
-
+ *old = qdisc_replace(sch, new, &q->queues[band]);
return 0;
}
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 3dc3a6e56052..8d2d8d953432 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -220,9 +220,10 @@ static struct qfq_class *qfq_find_class(struct Qdisc *sch, u32 classid)
static void qfq_purge_queue(struct qfq_class *cl)
{
unsigned int len = cl->qdisc->q.qlen;
+ unsigned int backlog = cl->qdisc->qstats.backlog;
qdisc_reset(cl->qdisc);
- qdisc_tree_decrease_qlen(cl->qdisc, len);
+ qdisc_tree_reduce_backlog(cl->qdisc, len, backlog);
}
static const struct nla_policy qfq_policy[TCA_QFQ_MAX + 1] = {
@@ -617,11 +618,7 @@ static int qfq_graft_class(struct Qdisc *sch, unsigned long arg,
new = &noop_qdisc;
}
- sch_tree_lock(sch);
- qfq_purge_queue(cl);
- *old = cl->qdisc;
- cl->qdisc = new;
- sch_tree_unlock(sch);
+ *old = qdisc_replace(sch, new, &cl->qdisc);
return 0;
}
diff --git a/net/sched/sch_red.c b/net/sched/sch_red.c
index 6c0534cc7758..8c0508c0e287 100644
--- a/net/sched/sch_red.c
+++ b/net/sched/sch_red.c
@@ -210,7 +210,8 @@ static int red_change(struct Qdisc *sch, struct nlattr *opt)
q->flags = ctl->flags;
q->limit = ctl->limit;
if (child) {
- qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+ q->qdisc->qstats.backlog);
qdisc_destroy(q->qdisc);
q->qdisc = child;
}
@@ -313,12 +314,7 @@ static int red_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
if (new == NULL)
new = &noop_qdisc;
- sch_tree_lock(sch);
- *old = q->qdisc;
- q->qdisc = new;
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- sch_tree_unlock(sch);
+ *old = qdisc_replace(sch, new, &q->qdisc);
return 0;
}
diff --git a/net/sched/sch_sfb.c b/net/sched/sch_sfb.c
index 5bbb6332ec57..c69611640fa5 100644
--- a/net/sched/sch_sfb.c
+++ b/net/sched/sch_sfb.c
@@ -510,7 +510,8 @@ static int sfb_change(struct Qdisc *sch, struct nlattr *opt)
sch_tree_lock(sch);
- qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+ q->qdisc->qstats.backlog);
qdisc_destroy(q->qdisc);
q->qdisc = child;
@@ -606,12 +607,7 @@ static int sfb_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
if (new == NULL)
new = &noop_qdisc;
- sch_tree_lock(sch);
- *old = q->qdisc;
- q->qdisc = new;
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- sch_tree_unlock(sch);
+ *old = qdisc_replace(sch, new, &q->qdisc);
return 0;
}
diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index 3abab534eb5c..498f0a2cb47f 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -346,7 +346,7 @@ static int
sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch)
{
struct sfq_sched_data *q = qdisc_priv(sch);
- unsigned int hash;
+ unsigned int hash, dropped;
sfq_index x, qlen;
struct sfq_slot *slot;
int uninitialized_var(ret);
@@ -461,7 +461,7 @@ enqueue:
return NET_XMIT_SUCCESS;
qlen = slot->qlen;
- sfq_drop(sch);
+ dropped = sfq_drop(sch);
/* Return Congestion Notification only if we dropped a packet
* from this flow.
*/
@@ -469,7 +469,7 @@ enqueue:
return NET_XMIT_CN;
/* As we dropped a packet, better let upper stack know this */
- qdisc_tree_decrease_qlen(sch, 1);
+ qdisc_tree_reduce_backlog(sch, 1, dropped);
return NET_XMIT_SUCCESS;
}
@@ -537,6 +537,7 @@ static void sfq_rehash(struct Qdisc *sch)
struct sfq_slot *slot;
struct sk_buff_head list;
int dropped = 0;
+ unsigned int drop_len = 0;
__skb_queue_head_init(&list);
@@ -565,6 +566,7 @@ static void sfq_rehash(struct Qdisc *sch)
if (x >= SFQ_MAX_FLOWS) {
drop:
qdisc_qstats_backlog_dec(sch, skb);
+ drop_len += qdisc_pkt_len(skb);
kfree_skb(skb);
dropped++;
continue;
@@ -594,7 +596,7 @@ drop:
}
}
sch->q.qlen -= dropped;
- qdisc_tree_decrease_qlen(sch, dropped);
+ qdisc_tree_reduce_backlog(sch, dropped, drop_len);
}
static void sfq_perturbation(unsigned long arg)
@@ -618,7 +620,7 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
struct sfq_sched_data *q = qdisc_priv(sch);
struct tc_sfq_qopt *ctl = nla_data(opt);
struct tc_sfq_qopt_v1 *ctl_v1 = NULL;
- unsigned int qlen;
+ unsigned int qlen, dropped = 0;
struct red_parms *p = NULL;
if (opt->nla_len < nla_attr_size(sizeof(*ctl)))
@@ -667,8 +669,8 @@ static int sfq_change(struct Qdisc *sch, struct nlattr *opt)
qlen = sch->q.qlen;
while (sch->q.qlen > q->limit)
- sfq_drop(sch);
- qdisc_tree_decrease_qlen(sch, qlen - sch->q.qlen);
+ dropped += sfq_drop(sch);
+ qdisc_tree_reduce_backlog(sch, qlen - sch->q.qlen, dropped);
del_timer(&q->perturb_timer);
if (q->perturb_period) {
diff --git a/net/sched/sch_tbf.c b/net/sched/sch_tbf.c
index a4afde14e865..c2fbde742f37 100644
--- a/net/sched/sch_tbf.c
+++ b/net/sched/sch_tbf.c
@@ -160,6 +160,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
struct tbf_sched_data *q = qdisc_priv(sch);
struct sk_buff *segs, *nskb;
netdev_features_t features = netif_skb_features(skb);
+ unsigned int len = 0, prev_len = qdisc_pkt_len(skb);
int ret, nb;
segs = skb_gso_segment(skb, features & ~NETIF_F_GSO_MASK);
@@ -172,6 +173,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
nskb = segs->next;
segs->next = NULL;
qdisc_skb_cb(segs)->pkt_len = segs->len;
+ len += segs->len;
ret = qdisc_enqueue(segs, q->qdisc);
if (ret != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret))
@@ -183,7 +185,7 @@ static int tbf_segment(struct sk_buff *skb, struct Qdisc *sch)
}
sch->q.qlen += nb;
if (nb > 1)
- qdisc_tree_decrease_qlen(sch, 1 - nb);
+ qdisc_tree_reduce_backlog(sch, 1 - nb, prev_len - len);
consume_skb(skb);
return nb > 0 ? NET_XMIT_SUCCESS : NET_XMIT_DROP;
}
@@ -399,7 +401,8 @@ static int tbf_change(struct Qdisc *sch, struct nlattr *opt)
sch_tree_lock(sch);
if (child) {
- qdisc_tree_decrease_qlen(q->qdisc, q->qdisc->q.qlen);
+ qdisc_tree_reduce_backlog(q->qdisc, q->qdisc->q.qlen,
+ q->qdisc->qstats.backlog);
qdisc_destroy(q->qdisc);
q->qdisc = child;
}
@@ -502,13 +505,7 @@ static int tbf_graft(struct Qdisc *sch, unsigned long arg, struct Qdisc *new,
if (new == NULL)
new = &noop_qdisc;
- sch_tree_lock(sch);
- *old = q->qdisc;
- q->qdisc = new;
- qdisc_tree_decrease_qlen(*old, (*old)->q.qlen);
- qdisc_reset(*old);
- sch_tree_unlock(sch);
-
+ *old = qdisc_replace(sch, new, &q->qdisc);
return 0;
}
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index b00f1f9611d6..e1849f3714ad 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -383,6 +383,7 @@ void sctp_association_free(struct sctp_association *asoc)
list_for_each_safe(pos, temp, &asoc->peer.transport_addr_list) {
transport = list_entry(pos, struct sctp_transport, transports);
list_del_rcu(pos);
+ sctp_unhash_transport(transport);
sctp_transport_free(transport);
}
@@ -500,6 +501,8 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
/* Remove this peer from the list. */
list_del_rcu(&peer->transports);
+ /* Remove this peer from the transport hashtable */
+ sctp_unhash_transport(peer);
/* Get the first transport of asoc. */
pos = asoc->peer.transport_addr_list.next;
@@ -699,6 +702,8 @@ struct sctp_transport *sctp_assoc_add_peer(struct sctp_association *asoc,
/* Attach the remote transport to our asoc. */
list_add_tail_rcu(&peer->transports, &asoc->peer.transport_addr_list);
asoc->peer.transport_count++;
+ /* Add this peer into the transport hashtable */
+ sctp_hash_transport(peer);
/* If we do not yet have a primary path, set one. */
if (!asoc->peer.primary_path) {
@@ -1258,7 +1263,7 @@ static struct sctp_transport *sctp_trans_elect_best(struct sctp_transport *curr,
if (score_curr > score_best)
return curr;
else if (score_curr == score_best)
- return sctp_trans_elect_tie(curr, best);
+ return sctp_trans_elect_tie(best, curr);
else
return best;
}
@@ -1401,7 +1406,8 @@ void sctp_assoc_sync_pmtu(struct sock *sk, struct sctp_association *asoc)
list_for_each_entry(t, &asoc->peer.transport_addr_list,
transports) {
if (t->pmtu_pending && t->dst) {
- sctp_transport_update_pmtu(sk, t, dst_mtu(t->dst));
+ sctp_transport_update_pmtu(sk, t,
+ WORD_TRUNC(dst_mtu(t->dst)));
t->pmtu_pending = 0;
}
if (!pmtu || (t->pathmtu < pmtu))
@@ -1488,7 +1494,7 @@ void sctp_assoc_rwnd_increase(struct sctp_association *asoc, unsigned int len)
asoc->peer.sack_needed = 0;
- sctp_outq_tail(&asoc->outqueue, sack);
+ sctp_outq_tail(&asoc->outqueue, sack, GFP_ATOMIC);
/* Stop the SACK timer. */
timer = &asoc->timers[SCTP_EVENT_TIMEOUT_SACK];
@@ -1590,7 +1596,7 @@ int sctp_assoc_lookup_laddr(struct sctp_association *asoc,
/* Set an association id for a given association */
int sctp_assoc_set_id(struct sctp_association *asoc, gfp_t gfp)
{
- bool preload = !!(gfp & __GFP_WAIT);
+ bool preload = gfpflags_allow_blocking(gfp);
int ret;
/* If the id is already assigned, keep it. */
diff --git a/net/sctp/auth.c b/net/sctp/auth.c
index 4f15b7d730e1..912eb1685a5d 100644
--- a/net/sctp/auth.c
+++ b/net/sctp/auth.c
@@ -27,9 +27,9 @@
* Vlad Yasevich <vladislav.yasevich@hp.com>
*/
+#include <crypto/hash.h>
#include <linux/slab.h>
#include <linux/types.h>
-#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <net/sctp/sctp.h>
#include <net/sctp/auth.h>
@@ -448,7 +448,7 @@ struct sctp_shared_key *sctp_auth_get_shkey(
*/
int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
{
- struct crypto_hash *tfm = NULL;
+ struct crypto_shash *tfm = NULL;
__u16 id;
/* If AUTH extension is disabled, we are done */
@@ -462,9 +462,8 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
return 0;
/* Allocated the array of pointers to transorms */
- ep->auth_hmacs = kzalloc(
- sizeof(struct crypto_hash *) * SCTP_AUTH_NUM_HMACS,
- gfp);
+ ep->auth_hmacs = kzalloc(sizeof(struct crypto_shash *) *
+ SCTP_AUTH_NUM_HMACS, gfp);
if (!ep->auth_hmacs)
return -ENOMEM;
@@ -483,8 +482,7 @@ int sctp_auth_init_hmacs(struct sctp_endpoint *ep, gfp_t gfp)
continue;
/* Allocate the ID */
- tfm = crypto_alloc_hash(sctp_hmac_list[id].hmac_name, 0,
- CRYPTO_ALG_ASYNC);
+ tfm = crypto_alloc_shash(sctp_hmac_list[id].hmac_name, 0, 0);
if (IS_ERR(tfm))
goto out_err;
@@ -500,7 +498,7 @@ out_err:
}
/* Destroy the hmac tfm array */
-void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[])
+void sctp_auth_destroy_hmacs(struct crypto_shash *auth_hmacs[])
{
int i;
@@ -508,8 +506,7 @@ void sctp_auth_destroy_hmacs(struct crypto_hash *auth_hmacs[])
return;
for (i = 0; i < SCTP_AUTH_NUM_HMACS; i++) {
- if (auth_hmacs[i])
- crypto_free_hash(auth_hmacs[i]);
+ crypto_free_shash(auth_hmacs[i]);
}
kfree(auth_hmacs);
}
@@ -709,8 +706,7 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
struct sctp_auth_chunk *auth,
gfp_t gfp)
{
- struct scatterlist sg;
- struct hash_desc desc;
+ struct crypto_shash *tfm;
struct sctp_auth_bytes *asoc_key;
__u16 key_id, hmac_id;
__u8 *digest;
@@ -742,16 +738,22 @@ void sctp_auth_calculate_hmac(const struct sctp_association *asoc,
/* set up scatter list */
end = skb_tail_pointer(skb);
- sg_init_one(&sg, auth, end - (unsigned char *)auth);
- desc.tfm = asoc->ep->auth_hmacs[hmac_id];
- desc.flags = 0;
+ tfm = asoc->ep->auth_hmacs[hmac_id];
digest = auth->auth_hdr.hmac;
- if (crypto_hash_setkey(desc.tfm, &asoc_key->data[0], asoc_key->len))
+ if (crypto_shash_setkey(tfm, &asoc_key->data[0], asoc_key->len))
goto free;
- crypto_hash_digest(&desc, &sg, sg.length, digest);
+ {
+ SHASH_DESC_ON_STACK(desc, tfm);
+
+ desc->tfm = tfm;
+ desc->flags = 0;
+ crypto_shash_digest(desc, (u8 *)auth,
+ end - (unsigned char *)auth, digest);
+ shash_desc_zero(desc);
+ }
free:
if (free_key)
@@ -809,8 +811,8 @@ int sctp_auth_ep_set_hmacs(struct sctp_endpoint *ep,
if (!has_sha1)
return -EINVAL;
- memcpy(ep->auth_hmacs_list->hmac_ids, &hmacs->shmac_idents[0],
- hmacs->shmac_num_idents * sizeof(__u16));
+ for (i = 0; i < hmacs->shmac_num_idents; i++)
+ ep->auth_hmacs_list->hmac_ids[i] = htons(hmacs->shmac_idents[i]);
ep->auth_hmacs_list->param_hdr.length = htons(sizeof(sctp_paramhdr_t) +
hmacs->shmac_num_idents * sizeof(__u16));
return 0;
diff --git a/net/sctp/bind_addr.c b/net/sctp/bind_addr.c
index 871cdf9567e6..401c60750b20 100644
--- a/net/sctp/bind_addr.c
+++ b/net/sctp/bind_addr.c
@@ -111,7 +111,8 @@ int sctp_bind_addr_dup(struct sctp_bind_addr *dest,
dest->port = src->port;
list_for_each_entry(addr, &src->address_list, list) {
- error = sctp_add_bind_addr(dest, &addr->a, 1, gfp);
+ error = sctp_add_bind_addr(dest, &addr->a, sizeof(addr->a),
+ 1, gfp);
if (error < 0)
break;
}
@@ -150,7 +151,7 @@ void sctp_bind_addr_free(struct sctp_bind_addr *bp)
/* Add an address to the bind address list in the SCTP_bind_addr structure. */
int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
- __u8 addr_state, gfp_t gfp)
+ int new_size, __u8 addr_state, gfp_t gfp)
{
struct sctp_sockaddr_entry *addr;
@@ -159,7 +160,7 @@ int sctp_add_bind_addr(struct sctp_bind_addr *bp, union sctp_addr *new,
if (!addr)
return -ENOMEM;
- memcpy(&addr->a, new, sizeof(*new));
+ memcpy(&addr->a, new, min_t(size_t, sizeof(*new), new_size));
/* Fix up the port if it has not yet been set.
* Both v4 and v6 have the port at the same offset.
@@ -291,7 +292,8 @@ int sctp_raw_to_bind_addrs(struct sctp_bind_addr *bp, __u8 *raw_addr_list,
}
af->from_addr_param(&addr, rawaddr, htons(port), 0);
- retval = sctp_add_bind_addr(bp, &addr, SCTP_ADDR_SRC, gfp);
+ retval = sctp_add_bind_addr(bp, &addr, sizeof(addr),
+ SCTP_ADDR_SRC, gfp);
if (retval) {
/* Can't finish building the list, clean up. */
sctp_bind_addr_clean(bp);
@@ -453,8 +455,8 @@ static int sctp_copy_one_addr(struct net *net, struct sctp_bind_addr *dest,
(((AF_INET6 == addr->sa.sa_family) &&
(flags & SCTP_ADDR6_ALLOWED) &&
(flags & SCTP_ADDR6_PEERSUPP))))
- error = sctp_add_bind_addr(dest, addr, SCTP_ADDR_SRC,
- gfp);
+ error = sctp_add_bind_addr(dest, addr, sizeof(*addr),
+ SCTP_ADDR_SRC, gfp);
}
return error;
diff --git a/net/sctp/chunk.c b/net/sctp/chunk.c
index a3380917f197..958ef5f33f4b 100644
--- a/net/sctp/chunk.c
+++ b/net/sctp/chunk.c
@@ -70,19 +70,6 @@ static struct sctp_datamsg *sctp_datamsg_new(gfp_t gfp)
return msg;
}
-void sctp_datamsg_free(struct sctp_datamsg *msg)
-{
- struct sctp_chunk *chunk;
-
- /* This doesn't have to be a _safe vairant because
- * sctp_chunk_free() only drops the refs.
- */
- list_for_each_entry(chunk, &msg->chunks, frag_list)
- sctp_chunk_free(chunk);
-
- sctp_datamsg_put(msg);
-}
-
/* Final destructruction of datamsg memory. */
static void sctp_datamsg_destroy(struct sctp_datamsg *msg)
{
@@ -273,7 +260,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
frag |= SCTP_DATA_SACK_IMM;
}
- chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag, 0);
+ chunk = sctp_make_datafrag_empty(asoc, sinfo, len, frag,
+ 0, GFP_KERNEL);
if (!chunk) {
err = -ENOMEM;
@@ -309,7 +297,8 @@ struct sctp_datamsg *sctp_datamsg_from_user(struct sctp_association *asoc,
(sinfo->sinfo_flags & SCTP_SACK_IMMEDIATELY))
frag |= SCTP_DATA_SACK_IMM;
- chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag, 0);
+ chunk = sctp_make_datafrag_empty(asoc, sinfo, over, frag,
+ 0, GFP_KERNEL);
if (!chunk) {
err = -ENOMEM;
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 9da76ba4d10f..9d494e35e7f9 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -42,7 +42,6 @@
#include <linux/slab.h>
#include <linux/in.h>
#include <linux/random.h> /* get_random_bytes() */
-#include <linux/crypto.h>
#include <net/sock.h>
#include <net/ipv6.h>
#include <net/sctp/sctp.h>
@@ -314,21 +313,16 @@ struct sctp_endpoint *sctp_endpoint_is_match(struct sctp_endpoint *ep,
}
/* Find the association that goes with this chunk.
- * We do a linear search of the associations for this endpoint.
- * We return the matching transport address too.
+ * We lookup the transport from hashtable at first, then get association
+ * through t->assoc.
*/
-static struct sctp_association *__sctp_endpoint_lookup_assoc(
+struct sctp_association *sctp_endpoint_lookup_assoc(
const struct sctp_endpoint *ep,
const union sctp_addr *paddr,
struct sctp_transport **transport)
{
struct sctp_association *asoc = NULL;
- struct sctp_association *tmp;
- struct sctp_transport *t = NULL;
- struct sctp_hashbucket *head;
- struct sctp_ep_common *epb;
- int hash;
- int rport;
+ struct sctp_transport *t;
*transport = NULL;
@@ -337,45 +331,16 @@ static struct sctp_association *__sctp_endpoint_lookup_assoc(
*/
if (!ep->base.bind_addr.port)
goto out;
+ t = sctp_epaddr_lookup_transport(ep, paddr);
+ if (!t)
+ goto out;
- rport = ntohs(paddr->v4.sin_port);
-
- hash = sctp_assoc_hashfn(sock_net(ep->base.sk), ep->base.bind_addr.port,
- rport);
- head = &sctp_assoc_hashtable[hash];
- read_lock(&head->lock);
- sctp_for_each_hentry(epb, &head->chain) {
- tmp = sctp_assoc(epb);
- if (tmp->ep != ep || rport != tmp->peer.port)
- continue;
-
- t = sctp_assoc_lookup_paddr(tmp, paddr);
- if (t) {
- asoc = tmp;
- *transport = t;
- break;
- }
- }
- read_unlock(&head->lock);
+ *transport = t;
+ asoc = t->asoc;
out:
return asoc;
}
-/* Lookup association on an endpoint based on a peer address. BH-safe. */
-struct sctp_association *sctp_endpoint_lookup_assoc(
- const struct sctp_endpoint *ep,
- const union sctp_addr *paddr,
- struct sctp_transport **transport)
-{
- struct sctp_association *asoc;
-
- local_bh_disable();
- asoc = __sctp_endpoint_lookup_assoc(ep, paddr, transport);
- local_bh_enable();
-
- return asoc;
-}
-
/* Look for any peeled off association from the endpoint that matches the
* given peer address.
*/
diff --git a/net/sctp/input.c b/net/sctp/input.c
index b6493b3f11a9..00b8445364e3 100644
--- a/net/sctp/input.c
+++ b/net/sctp/input.c
@@ -221,7 +221,7 @@ int sctp_rcv(struct sk_buff *skb)
goto discard_release;
/* Create an SCTP packet structure. */
- chunk = sctp_chunkify(skb, asoc, sk);
+ chunk = sctp_chunkify(skb, asoc, sk, GFP_ATOMIC);
if (!chunk)
goto discard_release;
SCTP_INPUT_CB(skb)->chunk = chunk;
@@ -606,7 +606,8 @@ void sctp_v4_err(struct sk_buff *skb, __u32 info)
/* PMTU discovery (RFC1191) */
if (ICMP_FRAG_NEEDED == code) {
- sctp_icmp_frag_needed(sk, asoc, transport, info);
+ sctp_icmp_frag_needed(sk, asoc, transport,
+ WORD_TRUNC(info));
goto out_unlock;
} else {
if (ICMP_PROT_UNREACH == code) {
@@ -782,65 +783,149 @@ hit:
return ep;
}
-/* Insert association into the hash table. */
-static void __sctp_hash_established(struct sctp_association *asoc)
+/* rhashtable for transport */
+struct sctp_hash_cmp_arg {
+ const struct sctp_endpoint *ep;
+ const union sctp_addr *laddr;
+ const union sctp_addr *paddr;
+ const struct net *net;
+};
+
+static inline int sctp_hash_cmp(struct rhashtable_compare_arg *arg,
+ const void *ptr)
{
- struct net *net = sock_net(asoc->base.sk);
- struct sctp_ep_common *epb;
- struct sctp_hashbucket *head;
+ const struct sctp_hash_cmp_arg *x = arg->key;
+ const struct sctp_transport *t = ptr;
+ struct sctp_association *asoc = t->asoc;
+ const struct net *net = x->net;
- epb = &asoc->base;
+ if (!sctp_cmp_addr_exact(&t->ipaddr, x->paddr))
+ return 1;
+ if (!net_eq(sock_net(asoc->base.sk), net))
+ return 1;
+ if (x->ep) {
+ if (x->ep != asoc->ep)
+ return 1;
+ } else {
+ if (x->laddr->v4.sin_port != htons(asoc->base.bind_addr.port))
+ return 1;
+ if (!sctp_bind_addr_match(&asoc->base.bind_addr,
+ x->laddr, sctp_sk(asoc->base.sk)))
+ return 1;
+ }
- /* Calculate which chain this entry will belong to. */
- epb->hashent = sctp_assoc_hashfn(net, epb->bind_addr.port,
- asoc->peer.port);
+ return 0;
+}
- head = &sctp_assoc_hashtable[epb->hashent];
+static inline u32 sctp_hash_obj(const void *data, u32 len, u32 seed)
+{
+ const struct sctp_transport *t = data;
+ const union sctp_addr *paddr = &t->ipaddr;
+ const struct net *net = sock_net(t->asoc->base.sk);
+ u16 lport = htons(t->asoc->base.bind_addr.port);
+ u32 addr;
+
+ if (paddr->sa.sa_family == AF_INET6)
+ addr = jhash(&paddr->v6.sin6_addr, 16, seed);
+ else
+ addr = paddr->v4.sin_addr.s_addr;
- write_lock(&head->lock);
- hlist_add_head(&epb->node, &head->chain);
- write_unlock(&head->lock);
+ return jhash_3words(addr, ((__u32)paddr->v4.sin_port) << 16 |
+ (__force __u32)lport, net_hash_mix(net), seed);
}
-/* Add an association to the hash. Local BH-safe. */
-void sctp_hash_established(struct sctp_association *asoc)
+static inline u32 sctp_hash_key(const void *data, u32 len, u32 seed)
{
- if (asoc->temp)
- return;
+ const struct sctp_hash_cmp_arg *x = data;
+ const union sctp_addr *paddr = x->paddr;
+ const struct net *net = x->net;
+ u16 lport;
+ u32 addr;
+
+ lport = x->ep ? htons(x->ep->base.bind_addr.port) :
+ x->laddr->v4.sin_port;
+ if (paddr->sa.sa_family == AF_INET6)
+ addr = jhash(&paddr->v6.sin6_addr, 16, seed);
+ else
+ addr = paddr->v4.sin_addr.s_addr;
- local_bh_disable();
- __sctp_hash_established(asoc);
- local_bh_enable();
+ return jhash_3words(addr, ((__u32)paddr->v4.sin_port) << 16 |
+ (__force __u32)lport, net_hash_mix(net), seed);
}
-/* Remove association from the hash table. */
-static void __sctp_unhash_established(struct sctp_association *asoc)
+static const struct rhashtable_params sctp_hash_params = {
+ .head_offset = offsetof(struct sctp_transport, node),
+ .hashfn = sctp_hash_key,
+ .obj_hashfn = sctp_hash_obj,
+ .obj_cmpfn = sctp_hash_cmp,
+ .automatic_shrinking = true,
+};
+
+int sctp_transport_hashtable_init(void)
{
- struct net *net = sock_net(asoc->base.sk);
- struct sctp_hashbucket *head;
- struct sctp_ep_common *epb;
+ return rhashtable_init(&sctp_transport_hashtable, &sctp_hash_params);
+}
- epb = &asoc->base;
+void sctp_transport_hashtable_destroy(void)
+{
+ rhashtable_destroy(&sctp_transport_hashtable);
+}
- epb->hashent = sctp_assoc_hashfn(net, epb->bind_addr.port,
- asoc->peer.port);
+void sctp_hash_transport(struct sctp_transport *t)
+{
+ struct sctp_hash_cmp_arg arg;
- head = &sctp_assoc_hashtable[epb->hashent];
+ if (t->asoc->temp)
+ return;
- write_lock(&head->lock);
- hlist_del_init(&epb->node);
- write_unlock(&head->lock);
+ arg.ep = t->asoc->ep;
+ arg.paddr = &t->ipaddr;
+ arg.net = sock_net(t->asoc->base.sk);
+
+reinsert:
+ if (rhashtable_lookup_insert_key(&sctp_transport_hashtable, &arg,
+ &t->node, sctp_hash_params) == -EBUSY)
+ goto reinsert;
}
-/* Remove association from the hash table. Local BH-safe. */
-void sctp_unhash_established(struct sctp_association *asoc)
+void sctp_unhash_transport(struct sctp_transport *t)
{
- if (asoc->temp)
+ if (t->asoc->temp)
return;
- local_bh_disable();
- __sctp_unhash_established(asoc);
- local_bh_enable();
+ rhashtable_remove_fast(&sctp_transport_hashtable, &t->node,
+ sctp_hash_params);
+}
+
+struct sctp_transport *sctp_addrs_lookup_transport(
+ struct net *net,
+ const union sctp_addr *laddr,
+ const union sctp_addr *paddr)
+{
+ struct sctp_hash_cmp_arg arg = {
+ .ep = NULL,
+ .laddr = laddr,
+ .paddr = paddr,
+ .net = net,
+ };
+
+ return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
+ sctp_hash_params);
+}
+
+struct sctp_transport *sctp_epaddr_lookup_transport(
+ const struct sctp_endpoint *ep,
+ const union sctp_addr *paddr)
+{
+ struct net *net = sock_net(ep->base.sk);
+ struct sctp_hash_cmp_arg arg = {
+ .ep = ep,
+ .paddr = paddr,
+ .net = net,
+ };
+
+ return rhashtable_lookup_fast(&sctp_transport_hashtable, &arg,
+ sctp_hash_params);
}
/* Look up an association. */
@@ -850,38 +935,24 @@ static struct sctp_association *__sctp_lookup_association(
const union sctp_addr *peer,
struct sctp_transport **pt)
{
- struct sctp_hashbucket *head;
- struct sctp_ep_common *epb;
- struct sctp_association *asoc;
- struct sctp_transport *transport;
- int hash;
+ struct sctp_transport *t;
+ struct sctp_association *asoc = NULL;
- /* Optimize here for direct hit, only listening connections can
- * have wildcards anyways.
- */
- hash = sctp_assoc_hashfn(net, ntohs(local->v4.sin_port),
- ntohs(peer->v4.sin_port));
- head = &sctp_assoc_hashtable[hash];
- read_lock(&head->lock);
- sctp_for_each_hentry(epb, &head->chain) {
- asoc = sctp_assoc(epb);
- transport = sctp_assoc_is_match(asoc, net, local, peer);
- if (transport)
- goto hit;
- }
+ t = sctp_addrs_lookup_transport(net, local, peer);
+ if (!t || !sctp_transport_hold(t))
+ goto out;
- read_unlock(&head->lock);
+ asoc = t->asoc;
+ sctp_association_hold(asoc);
+ *pt = t;
- return NULL;
+ sctp_transport_put(t);
-hit:
- *pt = transport;
- sctp_association_hold(asoc);
- read_unlock(&head->lock);
+out:
return asoc;
}
-/* Look up an association. BH-safe. */
+/* Look up an association. protected by RCU read lock */
static
struct sctp_association *sctp_lookup_association(struct net *net,
const union sctp_addr *laddr,
@@ -890,9 +961,9 @@ struct sctp_association *sctp_lookup_association(struct net *net,
{
struct sctp_association *asoc;
- local_bh_disable();
+ rcu_read_lock();
asoc = __sctp_lookup_association(net, laddr, paddr, transportp);
- local_bh_enable();
+ rcu_read_unlock();
return asoc;
}
diff --git a/net/sctp/ipv6.c b/net/sctp/ipv6.c
index e917d27328ea..ce46f1c7f133 100644
--- a/net/sctp/ipv6.c
+++ b/net/sctp/ipv6.c
@@ -209,6 +209,7 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
struct sock *sk = skb->sk;
struct ipv6_pinfo *np = inet6_sk(sk);
struct flowi6 *fl6 = &transport->fl.u.ip6;
+ int res;
pr_debug("%s: skb:%p, len:%d, src:%pI6 dst:%pI6\n", __func__, skb,
skb->len, &fl6->saddr, &fl6->daddr);
@@ -220,7 +221,10 @@ static int sctp_v6_xmit(struct sk_buff *skb, struct sctp_transport *transport)
SCTP_INC_STATS(sock_net(sk), SCTP_MIB_OUTSCTPPACKS);
- return ip6_xmit(sk, skb, fl6, np->opt, np->tclass);
+ rcu_read_lock();
+ res = ip6_xmit(sk, skb, fl6, rcu_dereference(np->opt), np->tclass);
+ rcu_read_unlock();
+ return res;
}
/* Returns the dst cache entry for the given source and destination ip
@@ -262,7 +266,10 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
pr_debug("src=%pI6 - ", &fl6->saddr);
}
- final_p = fl6_update_dst(fl6, np->opt, &final);
+ rcu_read_lock();
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
+ rcu_read_unlock();
+
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
if (!asoc || saddr)
goto out;
@@ -316,14 +323,13 @@ static void sctp_v6_get_dst(struct sctp_transport *t, union sctp_addr *saddr,
}
}
}
- rcu_read_unlock();
-
if (baddr) {
fl6->saddr = baddr->v6.sin6_addr;
fl6->fl6_sport = baddr->v6.sin6_port;
- final_p = fl6_update_dst(fl6, np->opt, &final);
+ final_p = fl6_update_dst(fl6, rcu_dereference(np->opt), &final);
dst = ip6_dst_lookup_flow(sk, fl6, final_p);
}
+ rcu_read_unlock();
out:
if (!IS_ERR_OR_NULL(dst)) {
@@ -520,6 +526,8 @@ static int sctp_v6_cmp_addr(const union sctp_addr *addr1,
}
return 0;
}
+ if (addr1->v6.sin6_port != addr2->v6.sin6_port)
+ return 0;
if (!ipv6_addr_equal(&addr1->v6.sin6_addr, &addr2->v6.sin6_addr))
return 0;
/* If this is a linklocal address, compare the scope_id. */
@@ -635,6 +643,7 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
struct sock *newsk;
struct ipv6_pinfo *newnp, *np = inet6_sk(sk);
struct sctp6_sock *newsctp6sk;
+ struct ipv6_txoptions *opt;
newsk = sk_alloc(sock_net(sk), PF_INET6, GFP_KERNEL, sk->sk_prot, 0);
if (!newsk)
@@ -654,6 +663,13 @@ static struct sock *sctp_v6_create_accept_sk(struct sock *sk,
memcpy(newnp, np, sizeof(struct ipv6_pinfo));
+ rcu_read_lock();
+ opt = rcu_dereference(np->opt);
+ if (opt)
+ opt = ipv6_dup_options(newsk, opt);
+ RCU_INIT_POINTER(newnp->opt, opt);
+ rcu_read_unlock();
+
/* Initialize sk's sport, dport, rcv_saddr and daddr for getsockname()
* and getpeername().
*/
diff --git a/net/sctp/output.c b/net/sctp/output.c
index abe7c2db2412..9844fe573029 100644
--- a/net/sctp/output.c
+++ b/net/sctp/output.c
@@ -153,7 +153,7 @@ void sctp_packet_free(struct sctp_packet *packet)
*/
sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
struct sctp_chunk *chunk,
- int one_packet)
+ int one_packet, gfp_t gfp)
{
sctp_xmit_t retval;
int error = 0;
@@ -163,7 +163,7 @@ sctp_xmit_t sctp_packet_transmit_chunk(struct sctp_packet *packet,
switch ((retval = (sctp_packet_append_chunk(packet, chunk)))) {
case SCTP_XMIT_PMTU_FULL:
if (!packet->has_cookie_echo) {
- error = sctp_packet_transmit(packet);
+ error = sctp_packet_transmit(packet, gfp);
if (error < 0)
chunk->skb->sk->sk_err = -error;
@@ -376,7 +376,7 @@ static void sctp_packet_set_owner_w(struct sk_buff *skb, struct sock *sk)
*
* The return value is a normal kernel error return value.
*/
-int sctp_packet_transmit(struct sctp_packet *packet)
+int sctp_packet_transmit(struct sctp_packet *packet, gfp_t gfp)
{
struct sctp_transport *tp = packet->transport;
struct sctp_association *asoc = tp->asoc;
@@ -401,7 +401,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
sk = chunk->skb->sk;
/* Allocate the new skb. */
- nskb = alloc_skb(packet->size + MAX_HEADER, GFP_ATOMIC);
+ nskb = alloc_skb(packet->size + MAX_HEADER, gfp);
if (!nskb)
goto nomem;
@@ -523,8 +523,8 @@ int sctp_packet_transmit(struct sctp_packet *packet)
*/
if (auth)
sctp_auth_calculate_hmac(asoc, nskb,
- (struct sctp_auth_chunk *)auth,
- GFP_ATOMIC);
+ (struct sctp_auth_chunk *)auth,
+ gfp);
/* 2) Calculate the Adler-32 checksum of the whole packet,
* including the SCTP common header and all the
@@ -534,7 +534,7 @@ int sctp_packet_transmit(struct sctp_packet *packet)
* by CRC32-C as described in <draft-ietf-tsvwg-sctpcsum-02.txt>.
*/
if (!sctp_checksum_disable) {
- if (!(dst->dev->features & NETIF_F_SCTP_CSUM) ||
+ if (!(dst->dev->features & NETIF_F_SCTP_CRC) ||
(dst_xfrm(dst) != NULL) || packet->ipfragok) {
sh->checksum = sctp_compute_cksum(nskb, 0);
} else {
@@ -705,7 +705,8 @@ static sctp_xmit_t sctp_packet_can_append_data(struct sctp_packet *packet,
/* Check whether this chunk and all the rest of pending data will fit
* or delay in hopes of bundling a full sized packet.
*/
- if (chunk->skb->len + q->out_qlen >= transport->pathmtu - packet->overhead)
+ if (chunk->skb->len + q->out_qlen >
+ transport->pathmtu - packet->overhead - sizeof(sctp_data_chunk_t) - 4)
/* Enough data queued to fill a packet */
return SCTP_XMIT_OK;
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 7e8f0a117106..8d3d3625130e 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -68,7 +68,7 @@ static void sctp_mark_missing(struct sctp_outq *q,
static void sctp_generate_fwdtsn(struct sctp_outq *q, __u32 sack_ctsn);
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout);
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp);
/* Add data to the front of the queue. */
static inline void sctp_outq_head_data(struct sctp_outq *q,
@@ -285,7 +285,7 @@ void sctp_outq_free(struct sctp_outq *q)
}
/* Put a new chunk in an sctp_outq. */
-int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
+int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk, gfp_t gfp)
{
struct net *net = sock_net(q->asoc->base.sk);
int error = 0;
@@ -324,6 +324,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
sctp_cname(SCTP_ST_CHUNK(chunk->chunk_hdr->type)) :
"illegal chunk");
+ sctp_chunk_hold(chunk);
sctp_outq_tail_data(q, chunk);
if (chunk->chunk_hdr->flags & SCTP_DATA_UNORDERED)
SCTP_INC_STATS(net, SCTP_MIB_OUTUNORDERCHUNKS);
@@ -340,7 +341,7 @@ int sctp_outq_tail(struct sctp_outq *q, struct sctp_chunk *chunk)
return error;
if (!q->cork)
- error = sctp_outq_flush(q, 0);
+ error = sctp_outq_flush(q, 0, gfp);
return error;
}
@@ -509,7 +510,7 @@ void sctp_retransmit(struct sctp_outq *q, struct sctp_transport *transport,
* will be flushed at the end.
*/
if (reason != SCTP_RTXR_FAST_RTX)
- error = sctp_outq_flush(q, /* rtx_timeout */ 1);
+ error = sctp_outq_flush(q, /* rtx_timeout */ 1, GFP_ATOMIC);
if (error)
q->asoc->base.sk->sk_err = -error;
@@ -600,12 +601,12 @@ redo:
* control chunks are already freed so there
* is nothing we can do.
*/
- sctp_packet_transmit(pkt);
+ sctp_packet_transmit(pkt, GFP_ATOMIC);
goto redo;
}
/* Send this packet. */
- error = sctp_packet_transmit(pkt);
+ error = sctp_packet_transmit(pkt, GFP_ATOMIC);
/* If we are retransmitting, we should only
* send a single packet.
@@ -621,7 +622,7 @@ redo:
case SCTP_XMIT_RWND_FULL:
/* Send this packet. */
- error = sctp_packet_transmit(pkt);
+ error = sctp_packet_transmit(pkt, GFP_ATOMIC);
/* Stop sending DATA as there is no more room
* at the receiver.
@@ -631,7 +632,7 @@ redo:
case SCTP_XMIT_DELAY:
/* Send this packet. */
- error = sctp_packet_transmit(pkt);
+ error = sctp_packet_transmit(pkt, GFP_ATOMIC);
/* Stop sending DATA because of nagle delay. */
done = 1;
@@ -684,12 +685,12 @@ redo:
}
/* Cork the outqueue so queued chunks are really queued. */
-int sctp_outq_uncork(struct sctp_outq *q)
+int sctp_outq_uncork(struct sctp_outq *q, gfp_t gfp)
{
if (q->cork)
q->cork = 0;
- return sctp_outq_flush(q, 0);
+ return sctp_outq_flush(q, 0, gfp);
}
@@ -702,7 +703,7 @@ int sctp_outq_uncork(struct sctp_outq *q)
* locking concerns must be made. Today we use the sock lock to protect
* this function.
*/
-static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
+static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout, gfp_t gfp)
{
struct sctp_packet *packet;
struct sctp_packet singleton;
@@ -824,7 +825,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
sctp_packet_init(&singleton, transport, sport, dport);
sctp_packet_config(&singleton, vtag, 0);
sctp_packet_append_chunk(&singleton, chunk);
- error = sctp_packet_transmit(&singleton);
+ error = sctp_packet_transmit(&singleton, gfp);
if (error < 0)
return error;
break;
@@ -855,7 +856,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
case SCTP_CID_ASCONF:
case SCTP_CID_FWD_TSN:
status = sctp_packet_transmit_chunk(packet, chunk,
- one_packet);
+ one_packet, gfp);
if (status != SCTP_XMIT_OK) {
/* put the chunk back */
list_add(&chunk->list, &q->control_chunk_list);
@@ -977,8 +978,12 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
(new_transport->state == SCTP_UNCONFIRMED) ||
(new_transport->state == SCTP_PF)))
new_transport = asoc->peer.active_path;
- if (new_transport->state == SCTP_UNCONFIRMED)
+ if (new_transport->state == SCTP_UNCONFIRMED) {
+ WARN_ONCE(1, "Atempt to send packet on unconfirmed path.");
+ sctp_chunk_fail(chunk, 0);
+ sctp_chunk_free(chunk);
continue;
+ }
/* Change packets if necessary. */
if (new_transport != transport) {
@@ -1010,7 +1015,7 @@ static int sctp_outq_flush(struct sctp_outq *q, int rtx_timeout)
atomic_read(&chunk->skb->users) : -1);
/* Add the chunk to the packet. */
- status = sctp_packet_transmit_chunk(packet, chunk, 0);
+ status = sctp_packet_transmit_chunk(packet, chunk, 0, gfp);
switch (status) {
case SCTP_XMIT_PMTU_FULL:
@@ -1087,7 +1092,7 @@ sctp_flush_out:
send_ready);
packet = &t->packet;
if (!sctp_packet_empty(packet))
- error = sctp_packet_transmit(packet);
+ error = sctp_packet_transmit(packet, gfp);
/* Clear the burst limited state, if any */
sctp_transport_burst_reset(t);
@@ -1251,6 +1256,7 @@ int sctp_outq_sack(struct sctp_outq *q, struct sctp_chunk *chunk)
*/
sack_a_rwnd = ntohl(sack->a_rwnd);
+ asoc->peer.zero_window_announced = !sack_a_rwnd;
outstanding = q->outstanding_bytes;
if (outstanding < sack_a_rwnd)
diff --git a/net/sctp/probe.c b/net/sctp/probe.c
index 5e68b94ee640..6cc2152e0740 100644
--- a/net/sctp/probe.c
+++ b/net/sctp/probe.c
@@ -65,7 +65,7 @@ static struct {
struct kfifo fifo;
spinlock_t lock;
wait_queue_head_t wait;
- struct timespec tstart;
+ struct timespec64 tstart;
} sctpw;
static __printf(1, 2) void printl(const char *fmt, ...)
@@ -85,7 +85,7 @@ static __printf(1, 2) void printl(const char *fmt, ...)
static int sctpprobe_open(struct inode *inode, struct file *file)
{
kfifo_reset(&sctpw.fifo);
- getnstimeofday(&sctpw.tstart);
+ ktime_get_ts64(&sctpw.tstart);
return 0;
}
@@ -138,7 +138,7 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
struct sk_buff *skb = chunk->skb;
struct sctp_transport *sp;
static __u32 lcwnd = 0;
- struct timespec now;
+ struct timespec64 now;
sp = asoc->peer.primary_path;
@@ -149,8 +149,8 @@ static sctp_disposition_t jsctp_sf_eat_sack(struct net *net,
(full || sp->cwnd != lcwnd)) {
lcwnd = sp->cwnd;
- getnstimeofday(&now);
- now = timespec_sub(now, sctpw.tstart);
+ ktime_get_ts64(&now);
+ now = timespec64_sub(now, sctpw.tstart);
printl("%lu.%06lu ", (unsigned long) now.tv_sec,
(unsigned long) now.tv_nsec / NSEC_PER_USEC);
diff --git a/net/sctp/proc.c b/net/sctp/proc.c
index 0697eda5aed8..5cfac8d5d3b3 100644
--- a/net/sctp/proc.c
+++ b/net/sctp/proc.c
@@ -161,12 +161,9 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
struct sctp_af *af;
primary = &assoc->peer.primary_addr;
- rcu_read_lock();
list_for_each_entry_rcu(transport, &assoc->peer.transport_addr_list,
transports) {
addr = &transport->ipaddr;
- if (transport->dead)
- continue;
af = sctp_get_af_specific(addr->sa.sa_family);
if (af->cmp_addr(addr, primary)) {
@@ -174,7 +171,6 @@ static void sctp_seq_dump_remote_addrs(struct seq_file *seq, struct sctp_associa
}
af->seq_dump_addr(seq, addr);
}
- rcu_read_unlock();
}
static void *sctp_eps_seq_start(struct seq_file *seq, loff_t *pos)
@@ -281,88 +277,140 @@ void sctp_eps_proc_exit(struct net *net)
remove_proc_entry("eps", net->sctp.proc_net_sctp);
}
+struct sctp_ht_iter {
+ struct seq_net_private p;
+ struct rhashtable_iter hti;
+};
-static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
+static struct sctp_transport *sctp_transport_get_next(struct seq_file *seq)
{
- if (*pos >= sctp_assoc_hashsize)
- return NULL;
+ struct sctp_ht_iter *iter = seq->private;
+ struct sctp_transport *t;
- if (*pos < 0)
- *pos = 0;
+ t = rhashtable_walk_next(&iter->hti);
+ for (; t; t = rhashtable_walk_next(&iter->hti)) {
+ if (IS_ERR(t)) {
+ if (PTR_ERR(t) == -EAGAIN)
+ continue;
+ break;
+ }
- if (*pos == 0)
- seq_printf(seq, " ASSOC SOCK STY SST ST HBKT "
- "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT "
- "RPORT LADDRS <-> RADDRS "
- "HBINT INS OUTS MAXRT T1X T2X RTXC "
- "wmema wmemq sndbuf rcvbuf\n");
+ if (net_eq(sock_net(t->asoc->base.sk), seq_file_net(seq)) &&
+ t->asoc->peer.primary_path == t)
+ break;
+ }
- return (void *)pos;
+ return t;
}
-static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
+static struct sctp_transport *sctp_transport_get_idx(struct seq_file *seq,
+ loff_t pos)
+{
+ void *obj = SEQ_START_TOKEN;
+
+ while (pos && (obj = sctp_transport_get_next(seq)) && !IS_ERR(obj))
+ pos--;
+
+ return obj;
+}
+
+static int sctp_transport_walk_start(struct seq_file *seq)
+{
+ struct sctp_ht_iter *iter = seq->private;
+ int err;
+
+ err = rhashtable_walk_init(&sctp_transport_hashtable, &iter->hti);
+ if (err)
+ return err;
+
+ err = rhashtable_walk_start(&iter->hti);
+
+ return err == -EAGAIN ? 0 : err;
+}
+
+static void sctp_transport_walk_stop(struct seq_file *seq)
{
+ struct sctp_ht_iter *iter = seq->private;
+
+ rhashtable_walk_stop(&iter->hti);
+ rhashtable_walk_exit(&iter->hti);
}
+static void *sctp_assocs_seq_start(struct seq_file *seq, loff_t *pos)
+{
+ int err = sctp_transport_walk_start(seq);
+
+ if (err)
+ return ERR_PTR(err);
+
+ return sctp_transport_get_idx(seq, *pos);
+}
+
+static void sctp_assocs_seq_stop(struct seq_file *seq, void *v)
+{
+ sctp_transport_walk_stop(seq);
+}
static void *sctp_assocs_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- if (++*pos >= sctp_assoc_hashsize)
- return NULL;
+ ++*pos;
- return pos;
+ return sctp_transport_get_next(seq);
}
/* Display sctp associations (/proc/net/sctp/assocs). */
static int sctp_assocs_seq_show(struct seq_file *seq, void *v)
{
- struct sctp_hashbucket *head;
- struct sctp_ep_common *epb;
+ struct sctp_transport *transport;
struct sctp_association *assoc;
+ struct sctp_ep_common *epb;
struct sock *sk;
- int hash = *(loff_t *)v;
- if (hash >= sctp_assoc_hashsize)
- return -ENOMEM;
-
- head = &sctp_assoc_hashtable[hash];
- local_bh_disable();
- read_lock(&head->lock);
- sctp_for_each_hentry(epb, &head->chain) {
- assoc = sctp_assoc(epb);
- sk = epb->sk;
- if (!net_eq(sock_net(sk), seq_file_net(seq)))
- continue;
- seq_printf(seq,
- "%8pK %8pK %-3d %-3d %-2d %-4d "
- "%4d %8d %8d %7u %5lu %-5d %5d ",
- assoc, sk, sctp_sk(sk)->type, sk->sk_state,
- assoc->state, hash,
- assoc->assoc_id,
- assoc->sndbuf_used,
- atomic_read(&assoc->rmem_alloc),
- from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
- sock_i_ino(sk),
- epb->bind_addr.port,
- assoc->peer.port);
- seq_printf(seq, " ");
- sctp_seq_dump_local_addrs(seq, epb);
- seq_printf(seq, "<-> ");
- sctp_seq_dump_remote_addrs(seq, assoc);
- seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d "
- "%8d %8d %8d %8d",
- assoc->hbinterval, assoc->c.sinit_max_instreams,
- assoc->c.sinit_num_ostreams, assoc->max_retrans,
- assoc->init_retries, assoc->shutdown_retries,
- assoc->rtx_data_chunks,
- atomic_read(&sk->sk_wmem_alloc),
- sk->sk_wmem_queued,
- sk->sk_sndbuf,
- sk->sk_rcvbuf);
- seq_printf(seq, "\n");
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, " ASSOC SOCK STY SST ST HBKT "
+ "ASSOC-ID TX_QUEUE RX_QUEUE UID INODE LPORT "
+ "RPORT LADDRS <-> RADDRS "
+ "HBINT INS OUTS MAXRT T1X T2X RTXC "
+ "wmema wmemq sndbuf rcvbuf\n");
+ return 0;
}
- read_unlock(&head->lock);
- local_bh_enable();
+
+ transport = (struct sctp_transport *)v;
+ if (!sctp_transport_hold(transport))
+ return 0;
+ assoc = transport->asoc;
+ epb = &assoc->base;
+ sk = epb->sk;
+
+ seq_printf(seq,
+ "%8pK %8pK %-3d %-3d %-2d %-4d "
+ "%4d %8d %8d %7u %5lu %-5d %5d ",
+ assoc, sk, sctp_sk(sk)->type, sk->sk_state,
+ assoc->state, 0,
+ assoc->assoc_id,
+ assoc->sndbuf_used,
+ atomic_read(&assoc->rmem_alloc),
+ from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk)),
+ sock_i_ino(sk),
+ epb->bind_addr.port,
+ assoc->peer.port);
+ seq_printf(seq, " ");
+ sctp_seq_dump_local_addrs(seq, epb);
+ seq_printf(seq, "<-> ");
+ sctp_seq_dump_remote_addrs(seq, assoc);
+ seq_printf(seq, "\t%8lu %5d %5d %4d %4d %4d %8d "
+ "%8d %8d %8d %8d",
+ assoc->hbinterval, assoc->c.sinit_max_instreams,
+ assoc->c.sinit_num_ostreams, assoc->max_retrans,
+ assoc->init_retries, assoc->shutdown_retries,
+ assoc->rtx_data_chunks,
+ atomic_read(&sk->sk_wmem_alloc),
+ sk->sk_wmem_queued,
+ sk->sk_sndbuf,
+ sk->sk_rcvbuf);
+ seq_printf(seq, "\n");
+
+ sctp_transport_put(transport);
return 0;
}
@@ -378,7 +426,7 @@ static const struct seq_operations sctp_assoc_ops = {
static int sctp_assocs_seq_open(struct inode *inode, struct file *file)
{
return seq_open_net(inode, file, &sctp_assoc_ops,
- sizeof(struct seq_net_private));
+ sizeof(struct sctp_ht_iter));
}
static const struct file_operations sctp_assocs_seq_fops = {
@@ -409,112 +457,96 @@ void sctp_assocs_proc_exit(struct net *net)
static void *sctp_remaddr_seq_start(struct seq_file *seq, loff_t *pos)
{
- if (*pos >= sctp_assoc_hashsize)
- return NULL;
-
- if (*pos < 0)
- *pos = 0;
+ int err = sctp_transport_walk_start(seq);
- if (*pos == 0)
- seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX "
- "REM_ADDR_RTX START STATE\n");
+ if (err)
+ return ERR_PTR(err);
- return (void *)pos;
+ return sctp_transport_get_idx(seq, *pos);
}
static void *sctp_remaddr_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
- if (++*pos >= sctp_assoc_hashsize)
- return NULL;
+ ++*pos;
- return pos;
+ return sctp_transport_get_next(seq);
}
static void sctp_remaddr_seq_stop(struct seq_file *seq, void *v)
{
+ sctp_transport_walk_stop(seq);
}
static int sctp_remaddr_seq_show(struct seq_file *seq, void *v)
{
- struct sctp_hashbucket *head;
- struct sctp_ep_common *epb;
struct sctp_association *assoc;
- struct sctp_transport *tsp;
- int hash = *(loff_t *)v;
+ struct sctp_transport *transport, *tsp;
- if (hash >= sctp_assoc_hashsize)
- return -ENOMEM;
+ if (v == SEQ_START_TOKEN) {
+ seq_printf(seq, "ADDR ASSOC_ID HB_ACT RTO MAX_PATH_RTX "
+ "REM_ADDR_RTX START STATE\n");
+ return 0;
+ }
- head = &sctp_assoc_hashtable[hash];
- local_bh_disable();
- read_lock(&head->lock);
- rcu_read_lock();
- sctp_for_each_hentry(epb, &head->chain) {
- if (!net_eq(sock_net(epb->sk), seq_file_net(seq)))
- continue;
- assoc = sctp_assoc(epb);
- list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
- transports) {
- if (tsp->dead)
- continue;
+ transport = (struct sctp_transport *)v;
+ if (!sctp_transport_hold(transport))
+ return 0;
+ assoc = transport->asoc;
+
+ list_for_each_entry_rcu(tsp, &assoc->peer.transport_addr_list,
+ transports) {
+ /*
+ * The remote address (ADDR)
+ */
+ tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr);
+ seq_printf(seq, " ");
+ /*
+ * The association ID (ASSOC_ID)
+ */
+ seq_printf(seq, "%d ", tsp->asoc->assoc_id);
+
+ /*
+ * If the Heartbeat is active (HB_ACT)
+ * Note: 1 = Active, 0 = Inactive
+ */
+ seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer));
+
+ /*
+ * Retransmit time out (RTO)
+ */
+ seq_printf(seq, "%lu ", tsp->rto);
+
+ /*
+ * Maximum path retransmit count (PATH_MAX_RTX)
+ */
+ seq_printf(seq, "%d ", tsp->pathmaxrxt);
+
+ /*
+ * remote address retransmit count (REM_ADDR_RTX)
+ * Note: We don't have a way to tally this at the moment
+ * so lets just leave it as zero for the moment
+ */
+ seq_puts(seq, "0 ");
+
+ /*
+ * remote address start time (START). This is also not
+ * currently implemented, but we can record it with a
+ * jiffies marker in a subsequent patch
+ */
+ seq_puts(seq, "0 ");
+
+ /*
+ * The current state of this destination. I.e.
+ * SCTP_ACTIVE, SCTP_INACTIVE, ...
+ */
+ seq_printf(seq, "%d", tsp->state);
- /*
- * The remote address (ADDR)
- */
- tsp->af_specific->seq_dump_addr(seq, &tsp->ipaddr);
- seq_printf(seq, " ");
-
- /*
- * The association ID (ASSOC_ID)
- */
- seq_printf(seq, "%d ", tsp->asoc->assoc_id);
-
- /*
- * If the Heartbeat is active (HB_ACT)
- * Note: 1 = Active, 0 = Inactive
- */
- seq_printf(seq, "%d ", timer_pending(&tsp->hb_timer));
-
- /*
- * Retransmit time out (RTO)
- */
- seq_printf(seq, "%lu ", tsp->rto);
-
- /*
- * Maximum path retransmit count (PATH_MAX_RTX)
- */
- seq_printf(seq, "%d ", tsp->pathmaxrxt);
-
- /*
- * remote address retransmit count (REM_ADDR_RTX)
- * Note: We don't have a way to tally this at the moment
- * so lets just leave it as zero for the moment
- */
- seq_puts(seq, "0 ");
-
- /*
- * remote address start time (START). This is also not
- * currently implemented, but we can record it with a
- * jiffies marker in a subsequent patch
- */
- seq_puts(seq, "0 ");
-
- /*
- * The current state of this destination. I.e.
- * SCTP_ACTIVE, SCTP_INACTIVE, ...
- */
- seq_printf(seq, "%d", tsp->state);
-
- seq_printf(seq, "\n");
- }
+ seq_printf(seq, "\n");
}
- rcu_read_unlock();
- read_unlock(&head->lock);
- local_bh_enable();
+ sctp_transport_put(transport);
return 0;
-
}
static const struct seq_operations sctp_remaddr_ops = {
@@ -533,7 +565,7 @@ void sctp_remaddr_proc_exit(struct net *net)
static int sctp_remaddr_seq_open(struct inode *inode, struct file *file)
{
return seq_open_net(inode, file, &sctp_remaddr_ops,
- sizeof(struct seq_net_private));
+ sizeof(struct sctp_ht_iter));
}
static const struct file_operations sctp_remaddr_seq_fops = {
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index 3d9ea9a48289..d3d50daa248b 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -60,6 +60,8 @@
#include <net/inet_common.h>
#include <net/inet_ecn.h>
+#define MAX_SCTP_PORT_HASH_ENTRIES (64 * 1024)
+
/* Global data structures. */
struct sctp_globals sctp_globals __read_mostly;
@@ -214,6 +216,7 @@ int sctp_copy_local_addr_list(struct net *net, struct sctp_bind_addr *bp,
(copy_flags & SCTP_ADDR6_ALLOWED) &&
(copy_flags & SCTP_ADDR6_PEERSUPP)))) {
error = sctp_add_bind_addr(bp, &addr->a,
+ sizeof(addr->a),
SCTP_ADDR_SRC, GFP_ATOMIC);
if (error)
goto end_copy;
@@ -1223,6 +1226,9 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Max.Burst - 4 */
net->sctp.max_burst = SCTP_DEFAULT_MAX_BURST;
+ /* Enable pf state by default */
+ net->sctp.pf_enable = 1;
+
/* Association.Max.Retrans - 10 attempts
* Path.Max.Retrans - 5 attempts (per destination address)
* Max.Init.Retransmits - 8 attempts
@@ -1352,6 +1358,8 @@ static __init int sctp_init(void)
unsigned long limit;
int max_share;
int order;
+ int num_entries;
+ int max_entry_order;
sock_skb_cb_check_size(sizeof(struct sctp_ulpevent));
@@ -1404,32 +1412,24 @@ static __init int sctp_init(void)
/* Size and allocate the association hash table.
* The methodology is similar to that of the tcp hash tables.
+ * Though not identical. Start by getting a goal size
*/
if (totalram_pages >= (128 * 1024))
goal = totalram_pages >> (22 - PAGE_SHIFT);
else
goal = totalram_pages >> (24 - PAGE_SHIFT);
- for (order = 0; (1UL << order) < goal; order++)
- ;
+ /* Then compute the page order for said goal */
+ order = get_order(goal);
- do {
- sctp_assoc_hashsize = (1UL << order) * PAGE_SIZE /
- sizeof(struct sctp_hashbucket);
- if ((sctp_assoc_hashsize > (64 * 1024)) && order > 0)
- continue;
- sctp_assoc_hashtable = (struct sctp_hashbucket *)
- __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order);
- } while (!sctp_assoc_hashtable && --order > 0);
- if (!sctp_assoc_hashtable) {
- pr_err("Failed association hash alloc\n");
- status = -ENOMEM;
- goto err_ahash_alloc;
- }
- for (i = 0; i < sctp_assoc_hashsize; i++) {
- rwlock_init(&sctp_assoc_hashtable[i].lock);
- INIT_HLIST_HEAD(&sctp_assoc_hashtable[i].chain);
- }
+ /* Now compute the required page order for the maximum sized table we
+ * want to create
+ */
+ max_entry_order = get_order(MAX_SCTP_PORT_HASH_ENTRIES *
+ sizeof(struct sctp_bind_hashbucket));
+
+ /* Limit the page order by that maximum hash table size */
+ order = min(order, max_entry_order);
/* Allocate and initialize the endpoint hash table. */
sctp_ep_hashsize = 64;
@@ -1445,27 +1445,45 @@ static __init int sctp_init(void)
INIT_HLIST_HEAD(&sctp_ep_hashtable[i].chain);
}
- /* Allocate and initialize the SCTP port hash table. */
+ /* Allocate and initialize the SCTP port hash table.
+ * Note that order is initalized to start at the max sized
+ * table we want to support. If we can't get that many pages
+ * reduce the order and try again
+ */
do {
- sctp_port_hashsize = (1UL << order) * PAGE_SIZE /
- sizeof(struct sctp_bind_hashbucket);
- if ((sctp_port_hashsize > (64 * 1024)) && order > 0)
- continue;
sctp_port_hashtable = (struct sctp_bind_hashbucket *)
- __get_free_pages(GFP_ATOMIC|__GFP_NOWARN, order);
+ __get_free_pages(GFP_KERNEL | __GFP_NOWARN, order);
} while (!sctp_port_hashtable && --order > 0);
+
if (!sctp_port_hashtable) {
pr_err("Failed bind hash alloc\n");
status = -ENOMEM;
goto err_bhash_alloc;
}
+
+ /* Now compute the number of entries that will fit in the
+ * port hash space we allocated
+ */
+ num_entries = (1UL << order) * PAGE_SIZE /
+ sizeof(struct sctp_bind_hashbucket);
+
+ /* And finish by rounding it down to the nearest power of two
+ * this wastes some memory of course, but its needed because
+ * the hash function operates based on the assumption that
+ * that the number of entries is a power of two
+ */
+ sctp_port_hashsize = rounddown_pow_of_two(num_entries);
+
for (i = 0; i < sctp_port_hashsize; i++) {
spin_lock_init(&sctp_port_hashtable[i].lock);
INIT_HLIST_HEAD(&sctp_port_hashtable[i].chain);
}
- pr_info("Hash tables configured (established %d bind %d)\n",
- sctp_assoc_hashsize, sctp_port_hashsize);
+ if (sctp_transport_hashtable_init())
+ goto err_thash_alloc;
+
+ pr_info("Hash tables configured (bind %d/%d)\n", sctp_port_hashsize,
+ num_entries);
sctp_sysctl_register();
@@ -1518,12 +1536,10 @@ err_register_defaults:
get_order(sctp_port_hashsize *
sizeof(struct sctp_bind_hashbucket)));
err_bhash_alloc:
+ sctp_transport_hashtable_destroy();
+err_thash_alloc:
kfree(sctp_ep_hashtable);
err_ehash_alloc:
- free_pages((unsigned long)sctp_assoc_hashtable,
- get_order(sctp_assoc_hashsize *
- sizeof(struct sctp_hashbucket)));
-err_ahash_alloc:
percpu_counter_destroy(&sctp_sockets_allocated);
err_percpu_counter_init:
kmem_cache_destroy(sctp_chunk_cachep);
@@ -1557,13 +1573,11 @@ static __exit void sctp_exit(void)
sctp_sysctl_unregister();
- free_pages((unsigned long)sctp_assoc_hashtable,
- get_order(sctp_assoc_hashsize *
- sizeof(struct sctp_hashbucket)));
- kfree(sctp_ep_hashtable);
free_pages((unsigned long)sctp_port_hashtable,
get_order(sctp_port_hashsize *
sizeof(struct sctp_bind_hashbucket)));
+ kfree(sctp_ep_hashtable);
+ sctp_transport_hashtable_destroy();
percpu_counter_destroy(&sctp_sockets_allocated);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 763e06a55155..7f0bf798205b 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -45,6 +45,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/ip.h>
@@ -52,7 +53,6 @@
#include <linux/net.h>
#include <linux/inet.h>
#include <linux/scatterlist.h>
-#include <linux/crypto.h>
#include <linux/slab.h>
#include <net/sock.h>
@@ -62,11 +62,13 @@
#include <net/sctp/sm.h>
static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
- __u8 type, __u8 flags, int paylen);
+ __u8 type, __u8 flags, int paylen,
+ gfp_t gfp);
static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
- __u8 flags, int paylen);
+ __u8 flags, int paylen, gfp_t gfp);
static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
- __u8 type, __u8 flags, int paylen);
+ __u8 type, __u8 flags, int paylen,
+ gfp_t gfp);
static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
const struct sctp_association *asoc,
const struct sctp_chunk *init_chunk,
@@ -318,7 +320,7 @@ struct sctp_chunk *sctp_make_init(const struct sctp_association *asoc,
* PLEASE DO NOT FIXME [This version does not support Host Name.]
*/
- retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize);
+ retval = sctp_make_control(asoc, SCTP_CID_INIT, 0, chunksize, gfp);
if (!retval)
goto nodata;
@@ -465,7 +467,7 @@ struct sctp_chunk *sctp_make_init_ack(const struct sctp_association *asoc,
num_ext);
/* Now allocate and fill out the chunk. */
- retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize);
+ retval = sctp_make_control(asoc, SCTP_CID_INIT_ACK, 0, chunksize, gfp);
if (!retval)
goto nomem_chunk;
@@ -570,7 +572,8 @@ struct sctp_chunk *sctp_make_cookie_echo(const struct sctp_association *asoc,
cookie_len = asoc->peer.cookie_len;
/* Build a cookie echo chunk. */
- retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0, cookie_len);
+ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ECHO, 0,
+ cookie_len, GFP_ATOMIC);
if (!retval)
goto nodata;
retval->subh.cookie_hdr =
@@ -615,7 +618,7 @@ struct sctp_chunk *sctp_make_cookie_ack(const struct sctp_association *asoc,
{
struct sctp_chunk *retval;
- retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0);
+ retval = sctp_make_control(asoc, SCTP_CID_COOKIE_ACK, 0, 0, GFP_ATOMIC);
/* RFC 2960 6.4 Multi-homed SCTP Endpoints
*
@@ -664,7 +667,7 @@ struct sctp_chunk *sctp_make_cwr(const struct sctp_association *asoc,
cwr.lowest_tsn = htonl(lowest_tsn);
retval = sctp_make_control(asoc, SCTP_CID_ECN_CWR, 0,
- sizeof(sctp_cwrhdr_t));
+ sizeof(sctp_cwrhdr_t), GFP_ATOMIC);
if (!retval)
goto nodata;
@@ -698,7 +701,7 @@ struct sctp_chunk *sctp_make_ecne(const struct sctp_association *asoc,
ecne.lowest_tsn = htonl(lowest_tsn);
retval = sctp_make_control(asoc, SCTP_CID_ECN_ECNE, 0,
- sizeof(sctp_ecnehdr_t));
+ sizeof(sctp_ecnehdr_t), GFP_ATOMIC);
if (!retval)
goto nodata;
retval->subh.ecne_hdr =
@@ -713,7 +716,8 @@ nodata:
*/
struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
const struct sctp_sndrcvinfo *sinfo,
- int data_len, __u8 flags, __u16 ssn)
+ int data_len, __u8 flags, __u16 ssn,
+ gfp_t gfp)
{
struct sctp_chunk *retval;
struct sctp_datahdr dp;
@@ -734,7 +738,7 @@ struct sctp_chunk *sctp_make_datafrag_empty(struct sctp_association *asoc,
dp.ssn = htons(ssn);
chunk_len = sizeof(dp) + data_len;
- retval = sctp_make_data(asoc, flags, chunk_len);
+ retval = sctp_make_data(asoc, flags, chunk_len, gfp);
if (!retval)
goto nodata;
@@ -781,7 +785,7 @@ struct sctp_chunk *sctp_make_sack(const struct sctp_association *asoc)
+ sizeof(__u32) * num_dup_tsns;
/* Create the chunk. */
- retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len);
+ retval = sctp_make_control(asoc, SCTP_CID_SACK, 0, len, GFP_ATOMIC);
if (!retval)
goto nodata;
@@ -861,7 +865,7 @@ struct sctp_chunk *sctp_make_shutdown(const struct sctp_association *asoc,
shut.cum_tsn_ack = htonl(ctsn);
retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN, 0,
- sizeof(sctp_shutdownhdr_t));
+ sizeof(sctp_shutdownhdr_t), GFP_ATOMIC);
if (!retval)
goto nodata;
@@ -879,7 +883,8 @@ struct sctp_chunk *sctp_make_shutdown_ack(const struct sctp_association *asoc,
{
struct sctp_chunk *retval;
- retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0);
+ retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_ACK, 0, 0,
+ GFP_ATOMIC);
/* RFC 2960 6.4 Multi-homed SCTP Endpoints
*
@@ -908,7 +913,8 @@ struct sctp_chunk *sctp_make_shutdown_complete(
*/
flags |= asoc ? 0 : SCTP_CHUNK_FLAG_T;
- retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags, 0);
+ retval = sctp_make_control(asoc, SCTP_CID_SHUTDOWN_COMPLETE, flags,
+ 0, GFP_ATOMIC);
/* RFC 2960 6.4 Multi-homed SCTP Endpoints
*
@@ -947,7 +953,8 @@ struct sctp_chunk *sctp_make_abort(const struct sctp_association *asoc,
flags = SCTP_CHUNK_FLAG_T;
}
- retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint);
+ retval = sctp_make_control(asoc, SCTP_CID_ABORT, flags, hint,
+ GFP_ATOMIC);
/* RFC 2960 6.4 Multi-homed SCTP Endpoints
*
@@ -1139,7 +1146,8 @@ struct sctp_chunk *sctp_make_heartbeat(const struct sctp_association *asoc,
struct sctp_chunk *retval;
sctp_sender_hb_info_t hbinfo;
- retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0, sizeof(hbinfo));
+ retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT, 0,
+ sizeof(hbinfo), GFP_ATOMIC);
if (!retval)
goto nodata;
@@ -1167,7 +1175,8 @@ struct sctp_chunk *sctp_make_heartbeat_ack(const struct sctp_association *asoc,
{
struct sctp_chunk *retval;
- retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen);
+ retval = sctp_make_control(asoc, SCTP_CID_HEARTBEAT_ACK, 0, paylen,
+ GFP_ATOMIC);
if (!retval)
goto nodata;
@@ -1200,7 +1209,7 @@ static struct sctp_chunk *sctp_make_op_error_space(
struct sctp_chunk *retval;
retval = sctp_make_control(asoc, SCTP_CID_ERROR, 0,
- sizeof(sctp_errhdr_t) + size);
+ sizeof(sctp_errhdr_t) + size, GFP_ATOMIC);
if (!retval)
goto nodata;
@@ -1271,7 +1280,8 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
return NULL;
retval = sctp_make_control(asoc, SCTP_CID_AUTH, 0,
- hmac_desc->hmac_len + sizeof(sctp_authhdr_t));
+ hmac_desc->hmac_len + sizeof(sctp_authhdr_t),
+ GFP_ATOMIC);
if (!retval)
return NULL;
@@ -1309,11 +1319,11 @@ struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc)
*/
struct sctp_chunk *sctp_chunkify(struct sk_buff *skb,
const struct sctp_association *asoc,
- struct sock *sk)
+ struct sock *sk, gfp_t gfp)
{
struct sctp_chunk *retval;
- retval = kmem_cache_zalloc(sctp_chunk_cachep, GFP_ATOMIC);
+ retval = kmem_cache_zalloc(sctp_chunk_cachep, gfp);
if (!retval)
goto nodata;
@@ -1361,7 +1371,8 @@ const union sctp_addr *sctp_source(const struct sctp_chunk *chunk)
* arguments, reserving enough space for a 'paylen' byte payload.
*/
static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
- __u8 type, __u8 flags, int paylen)
+ __u8 type, __u8 flags, int paylen,
+ gfp_t gfp)
{
struct sctp_chunk *retval;
sctp_chunkhdr_t *chunk_hdr;
@@ -1369,8 +1380,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
struct sock *sk;
/* No need to allocate LL here, as this is only a chunk. */
- skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen),
- GFP_ATOMIC);
+ skb = alloc_skb(WORD_ROUND(sizeof(sctp_chunkhdr_t) + paylen), gfp);
if (!skb)
goto nodata;
@@ -1381,7 +1391,7 @@ static struct sctp_chunk *_sctp_make_chunk(const struct sctp_association *asoc,
chunk_hdr->length = htons(sizeof(sctp_chunkhdr_t));
sk = asoc ? asoc->base.sk : NULL;
- retval = sctp_chunkify(skb, asoc, sk);
+ retval = sctp_chunkify(skb, asoc, sk, gfp);
if (!retval) {
kfree_skb(skb);
goto nodata;
@@ -1400,16 +1410,18 @@ nodata:
}
static struct sctp_chunk *sctp_make_data(const struct sctp_association *asoc,
- __u8 flags, int paylen)
+ __u8 flags, int paylen, gfp_t gfp)
{
- return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen);
+ return _sctp_make_chunk(asoc, SCTP_CID_DATA, flags, paylen, gfp);
}
static struct sctp_chunk *sctp_make_control(const struct sctp_association *asoc,
- __u8 type, __u8 flags, int paylen)
+ __u8 type, __u8 flags, int paylen,
+ gfp_t gfp)
{
- struct sctp_chunk *chunk = _sctp_make_chunk(asoc, type, flags, paylen);
+ struct sctp_chunk *chunk;
+ chunk = _sctp_make_chunk(asoc, type, flags, paylen, gfp);
if (chunk)
sctp_control_set_owner_w(chunk);
@@ -1606,7 +1618,6 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
{
sctp_cookie_param_t *retval;
struct sctp_signed_cookie *cookie;
- struct scatterlist sg;
int headersize, bodysize;
/* Header size is static data prior to the actual cookie, including
@@ -1652,7 +1663,7 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
/* Set an expiration time for the cookie. */
cookie->c.expiration = ktime_add(asoc->cookie_life,
- ktime_get());
+ ktime_get_real());
/* Copy the peer's init packet. */
memcpy(&cookie->c.peer_init[0], init_chunk->chunk_hdr,
@@ -1663,16 +1674,19 @@ static sctp_cookie_param_t *sctp_pack_cookie(const struct sctp_endpoint *ep,
ntohs(init_chunk->chunk_hdr->length), raw_addrs, addrs_len);
if (sctp_sk(ep->base.sk)->hmac) {
- struct hash_desc desc;
+ SHASH_DESC_ON_STACK(desc, sctp_sk(ep->base.sk)->hmac);
+ int err;
/* Sign the message. */
- sg_init_one(&sg, &cookie->c, bodysize);
- desc.tfm = sctp_sk(ep->base.sk)->hmac;
- desc.flags = 0;
-
- if (crypto_hash_setkey(desc.tfm, ep->secret_key,
- sizeof(ep->secret_key)) ||
- crypto_hash_digest(&desc, &sg, bodysize, cookie->signature))
+ desc->tfm = sctp_sk(ep->base.sk)->hmac;
+ desc->flags = 0;
+
+ err = crypto_shash_setkey(desc->tfm, ep->secret_key,
+ sizeof(ep->secret_key)) ?:
+ crypto_shash_digest(desc, (u8 *)&cookie->c, bodysize,
+ cookie->signature);
+ shash_desc_zero(desc);
+ if (err)
goto free_cookie;
}
@@ -1697,12 +1711,10 @@ struct sctp_association *sctp_unpack_cookie(
struct sctp_cookie *bear_cookie;
int headersize, bodysize, fixed_size;
__u8 *digest = ep->digest;
- struct scatterlist sg;
unsigned int len;
sctp_scope_t scope;
struct sk_buff *skb = chunk->skb;
ktime_t kt;
- struct hash_desc desc;
/* Header size is static data prior to the actual cookie, including
* any padding.
@@ -1733,16 +1745,23 @@ struct sctp_association *sctp_unpack_cookie(
goto no_hmac;
/* Check the signature. */
- sg_init_one(&sg, bear_cookie, bodysize);
- desc.tfm = sctp_sk(ep->base.sk)->hmac;
- desc.flags = 0;
-
- memset(digest, 0x00, SCTP_SIGNATURE_SIZE);
- if (crypto_hash_setkey(desc.tfm, ep->secret_key,
- sizeof(ep->secret_key)) ||
- crypto_hash_digest(&desc, &sg, bodysize, digest)) {
- *error = -SCTP_IERROR_NOMEM;
- goto fail;
+ {
+ SHASH_DESC_ON_STACK(desc, sctp_sk(ep->base.sk)->hmac);
+ int err;
+
+ desc->tfm = sctp_sk(ep->base.sk)->hmac;
+ desc->flags = 0;
+
+ err = crypto_shash_setkey(desc->tfm, ep->secret_key,
+ sizeof(ep->secret_key)) ?:
+ crypto_shash_digest(desc, (u8 *)bear_cookie, bodysize,
+ digest);
+ shash_desc_zero(desc);
+
+ if (err) {
+ *error = -SCTP_IERROR_NOMEM;
+ goto fail;
+ }
}
if (memcmp(digest, cookie->signature, SCTP_SIGNATURE_SIZE)) {
@@ -1780,7 +1799,7 @@ no_hmac:
if (sock_flag(ep->base.sk, SOCK_TIMESTAMP))
kt = skb_get_ktime(skb);
else
- kt = ktime_get();
+ kt = ktime_get_real();
if (!asoc && ktime_before(bear_cookie->expiration, kt)) {
/*
@@ -1830,7 +1849,8 @@ no_hmac:
/* Also, add the destination address. */
if (list_empty(&retval->base.bind_addr.address_list)) {
sctp_add_bind_addr(&retval->base.bind_addr, &chunk->dest,
- SCTP_ADDR_SRC, GFP_ATOMIC);
+ sizeof(chunk->dest), SCTP_ADDR_SRC,
+ GFP_ATOMIC);
}
retval->next_tsn = retval->c.initial_tsn;
@@ -2756,7 +2776,8 @@ static struct sctp_chunk *sctp_make_asconf(struct sctp_association *asoc,
length += addrlen;
/* Create the chunk. */
- retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length);
+ retval = sctp_make_control(asoc, SCTP_CID_ASCONF, 0, length,
+ GFP_ATOMIC);
if (!retval)
return NULL;
@@ -2940,7 +2961,8 @@ static struct sctp_chunk *sctp_make_asconf_ack(const struct sctp_association *as
int length = sizeof(asconf) + vparam_len;
/* Create the chunk. */
- retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length);
+ retval = sctp_make_control(asoc, SCTP_CID_ASCONF_ACK, 0, length,
+ GFP_ATOMIC);
if (!retval)
return NULL;
@@ -3500,7 +3522,7 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct sctp_association *asoc,
hint = (nstreams + 1) * sizeof(__u32);
- retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint);
+ retval = sctp_make_control(asoc, SCTP_CID_FWD_TSN, 0, hint, GFP_ATOMIC);
if (!retval)
return NULL;
diff --git a/net/sctp/sm_sideeffect.c b/net/sctp/sm_sideeffect.c
index 6098d4c42fa9..7fe56d0acabf 100644
--- a/net/sctp/sm_sideeffect.c
+++ b/net/sctp/sm_sideeffect.c
@@ -63,7 +63,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
sctp_state_t state,
struct sctp_endpoint *ep,
- struct sctp_association *asoc,
+ struct sctp_association **asoc,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
@@ -215,10 +215,14 @@ static int sctp_gen_sack(struct sctp_association *asoc, int force,
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_RESTART,
SCTP_TO(SCTP_EVENT_TIMEOUT_SACK));
} else {
+ __u32 old_a_rwnd = asoc->a_rwnd;
+
asoc->a_rwnd = asoc->rwnd;
sack = sctp_make_sack(asoc);
- if (!sack)
+ if (!sack) {
+ asoc->a_rwnd = old_a_rwnd;
goto nomem;
+ }
asoc->peer.sack_needed = 0;
asoc->peer.sack_cnt = 0;
@@ -259,12 +263,6 @@ void sctp_generate_t3_rtx_event(unsigned long peer)
goto out_unlock;
}
- /* Is this transport really dead and just waiting around for
- * the timer to let go of the reference?
- */
- if (transport->dead)
- goto out_unlock;
-
/* Run through the state machine. */
error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_T3_RTX),
@@ -380,12 +378,6 @@ void sctp_generate_heartbeat_event(unsigned long data)
goto out_unlock;
}
- /* Is this structure just waiting around for us to actually
- * get destroyed?
- */
- if (transport->dead)
- goto out_unlock;
-
error = sctp_do_sm(net, SCTP_EVENT_T_TIMEOUT,
SCTP_ST_TIMEOUT(SCTP_EVENT_TIMEOUT_HEARTBEAT),
asoc->state, asoc->ep, asoc,
@@ -477,6 +469,8 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
struct sctp_transport *transport,
int is_hb)
{
+ struct net *net = sock_net(asoc->base.sk);
+
/* The check for association's overall error counter exceeding the
* threshold is done in the state function.
*/
@@ -503,7 +497,8 @@ static void sctp_do_8_2_transport_strike(sctp_cmd_seq_t *commands,
* is SCTP_ACTIVE, then mark this transport as Partially Failed,
* see SCTP Quick Failover Draft, section 5.1
*/
- if ((transport->state == SCTP_ACTIVE) &&
+ if (net->sctp.pf_enable &&
+ (transport->state == SCTP_ACTIVE) &&
(asoc->pf_retrans < transport->pathmaxrxt) &&
(transport->error_count > asoc->pf_retrans)) {
@@ -863,7 +858,6 @@ static void sctp_cmd_delete_tcb(sctp_cmd_seq_t *cmds,
(!asoc->temp) && (sk->sk_shutdown != SHUTDOWN_MASK))
return;
- sctp_unhash_established(asoc);
sctp_association_free(asoc);
}
@@ -1029,13 +1023,13 @@ static void sctp_cmd_t1_timer_update(struct sctp_association *asoc,
* encouraged for small fragments.
*/
static int sctp_cmd_send_msg(struct sctp_association *asoc,
- struct sctp_datamsg *msg)
+ struct sctp_datamsg *msg, gfp_t gfp)
{
struct sctp_chunk *chunk;
int error = 0;
list_for_each_entry(chunk, &msg->chunks, frag_list) {
- error = sctp_outq_tail(&asoc->outqueue, chunk);
+ error = sctp_outq_tail(&asoc->outqueue, chunk, gfp);
if (error)
break;
}
@@ -1123,7 +1117,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype,
debug_post_sfn();
error = sctp_side_effects(event_type, subtype, state,
- ep, asoc, event_arg, status,
+ ep, &asoc, event_arg, status,
&commands, gfp);
debug_post_sfx();
@@ -1136,7 +1130,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, sctp_subtype_t subtype,
static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
sctp_state_t state,
struct sctp_endpoint *ep,
- struct sctp_association *asoc,
+ struct sctp_association **asoc,
void *event_arg,
sctp_disposition_t status,
sctp_cmd_seq_t *commands,
@@ -1151,7 +1145,7 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
* disposition SCTP_DISPOSITION_CONSUME.
*/
if (0 != (error = sctp_cmd_interpreter(event_type, subtype, state,
- ep, asoc,
+ ep, *asoc,
event_arg, status,
commands, gfp)))
goto bail;
@@ -1174,11 +1168,12 @@ static int sctp_side_effects(sctp_event_t event_type, sctp_subtype_t subtype,
break;
case SCTP_DISPOSITION_DELETE_TCB:
+ case SCTP_DISPOSITION_ABORT:
/* This should now be a command. */
+ *asoc = NULL;
break;
case SCTP_DISPOSITION_CONSUME:
- case SCTP_DISPOSITION_ABORT:
/*
* We should no longer have much work to do here as the
* real work has been done as explicit commands above.
@@ -1258,7 +1253,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_NEW_ASOC:
/* Register a new association. */
if (local_cork) {
- sctp_outq_uncork(&asoc->outqueue);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
local_cork = 0;
}
@@ -1266,7 +1261,6 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
asoc = cmd->obj.asoc;
BUG_ON(asoc->peer.primary_path == NULL);
sctp_endpoint_add_asoc(ep, asoc);
- sctp_hash_established(asoc);
break;
case SCTP_CMD_UPDATE_ASSOC:
@@ -1279,7 +1273,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_DELETE_TCB:
if (local_cork) {
- sctp_outq_uncork(&asoc->outqueue);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
local_cork = 0;
}
/* Delete the current association. */
@@ -1433,13 +1427,14 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
local_cork = 1;
}
/* Send a chunk to our peer. */
- error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk);
+ error = sctp_outq_tail(&asoc->outqueue, cmd->obj.chunk,
+ gfp);
break;
case SCTP_CMD_SEND_PKT:
/* Send a full packet to our peer. */
packet = cmd->obj.packet;
- sctp_packet_transmit(packet);
+ sctp_packet_transmit(packet, gfp);
sctp_ootb_pkt_free(packet);
break;
@@ -1649,7 +1644,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
*/
chunk->pdiscard = 1;
if (asoc) {
- sctp_outq_uncork(&asoc->outqueue);
+ sctp_outq_uncork(&asoc->outqueue, gfp);
local_cork = 0;
}
break;
@@ -1687,7 +1682,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
case SCTP_CMD_FORCE_PRIM_RETRAN:
t = asoc->peer.retran_path;
asoc->peer.retran_path = asoc->peer.primary_path;
- error = sctp_outq_uncork(&asoc->outqueue);
+ error = sctp_outq_uncork(&asoc->outqueue, gfp);
local_cork = 0;
asoc->peer.retran_path = t;
break;
@@ -1714,7 +1709,7 @@ static int sctp_cmd_interpreter(sctp_event_t event_type,
sctp_outq_cork(&asoc->outqueue);
local_cork = 1;
}
- error = sctp_cmd_send_msg(asoc, cmd->obj.msg);
+ error = sctp_cmd_send_msg(asoc, cmd->obj.msg, gfp);
break;
case SCTP_CMD_SEND_NEXT_ASCONF:
sctp_cmd_send_asconf(asoc);
@@ -1744,9 +1739,9 @@ out:
*/
if (asoc && SCTP_EVENT_T_CHUNK == event_type && chunk) {
if (chunk->end_of_packet || chunk->singleton)
- error = sctp_outq_uncork(&asoc->outqueue);
+ error = sctp_outq_uncork(&asoc->outqueue, gfp);
} else if (local_cork)
- error = sctp_outq_uncork(&asoc->outqueue);
+ error = sctp_outq_uncork(&asoc->outqueue, gfp);
return error;
nomem:
error = -ENOMEM;
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 6f46aa16cb76..f1f08c8f277b 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -2976,7 +2976,7 @@ sctp_disposition_t sctp_sf_eat_data_6_2(struct net *net,
SCTP_INC_STATS(net, SCTP_MIB_IN_DATA_CHUNK_DISCARDS);
goto discard_force;
case SCTP_IERROR_NO_DATA:
- goto consume;
+ return SCTP_DISPOSITION_ABORT;
case SCTP_IERROR_PROTO_VIOLATION:
return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
(u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t));
@@ -3043,9 +3043,6 @@ discard_noforce:
sctp_add_cmd_sf(commands, SCTP_CMD_GEN_SACK, force);
return SCTP_DISPOSITION_DISCARD;
-consume:
- return SCTP_DISPOSITION_CONSUME;
-
}
/*
@@ -3093,7 +3090,7 @@ sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net,
case SCTP_IERROR_BAD_STREAM:
break;
case SCTP_IERROR_NO_DATA:
- goto consume;
+ return SCTP_DISPOSITION_ABORT;
case SCTP_IERROR_PROTO_VIOLATION:
return sctp_sf_abort_violation(net, ep, asoc, chunk, commands,
(u8 *)chunk->subh.data_hdr, sizeof(sctp_datahdr_t));
@@ -3119,7 +3116,6 @@ sctp_disposition_t sctp_sf_eat_data_fast_4_4(struct net *net,
SCTP_TO(SCTP_EVENT_TIMEOUT_T2_SHUTDOWN));
}
-consume:
return SCTP_DISPOSITION_CONSUME;
}
@@ -4825,11 +4821,9 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort(
* if necessary to fill gaps.
*/
struct sctp_chunk *abort = arg;
- sctp_disposition_t retval;
- retval = SCTP_DISPOSITION_CONSUME;
-
- sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+ if (abort)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
/* Even if we can't send the ABORT due to low memory delete the
* TCB. This is a departure from our typical NOMEM handling.
@@ -4844,7 +4838,7 @@ sctp_disposition_t sctp_sf_do_9_1_prm_abort(
SCTP_INC_STATS(net, SCTP_MIB_ABORTEDS);
SCTP_DEC_STATS(net, SCTP_MIB_CURRESTAB);
- return retval;
+ return SCTP_DISPOSITION_ABORT;
}
/* We tried an illegal operation on an association which is closed. */
@@ -4959,14 +4953,13 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort(
sctp_cmd_seq_t *commands)
{
struct sctp_chunk *abort = arg;
- sctp_disposition_t retval;
/* Stop T1-init timer */
sctp_add_cmd_sf(commands, SCTP_CMD_TIMER_STOP,
SCTP_TO(SCTP_EVENT_TIMEOUT_T1_INIT));
- retval = SCTP_DISPOSITION_CONSUME;
- sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
+ if (abort)
+ sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(abort));
sctp_add_cmd_sf(commands, SCTP_CMD_NEW_STATE,
SCTP_STATE(SCTP_STATE_CLOSED));
@@ -4983,7 +4976,7 @@ sctp_disposition_t sctp_sf_cookie_wait_prm_abort(
sctp_add_cmd_sf(commands, SCTP_CMD_INIT_FAILED,
SCTP_PERR(SCTP_ERROR_USER_ABORT));
- return retval;
+ return SCTP_DISPOSITION_ABORT;
}
/*
@@ -5412,7 +5405,8 @@ sctp_disposition_t sctp_sf_do_6_3_3_rtx(struct net *net,
SCTP_INC_STATS(net, SCTP_MIB_T3_RTX_EXPIREDS);
if (asoc->overall_error_count >= asoc->max_retrans) {
- if (asoc->state == SCTP_STATE_SHUTDOWN_PENDING) {
+ if (asoc->peer.zero_window_announced &&
+ asoc->state == SCTP_STATE_SHUTDOWN_PENDING) {
/*
* We are here likely because the receiver had its rwnd
* closed for a while and we have not been able to
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 897c01c029ca..878d28eda1a6 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -52,6 +52,7 @@
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+#include <crypto/hash.h>
#include <linux/types.h>
#include <linux/kernel.h>
#include <linux/wait.h>
@@ -61,7 +62,6 @@
#include <linux/fcntl.h>
#include <linux/poll.h>
#include <linux/init.h>
-#include <linux/crypto.h>
#include <linux/slab.h>
#include <linux/file.h>
#include <linux/compat.h>
@@ -386,7 +386,8 @@ static int sctp_do_bind(struct sock *sk, union sctp_addr *addr, int len)
/* Add the address to the bind address list.
* Use GFP_ATOMIC since BHs will be disabled.
*/
- ret = sctp_add_bind_addr(bp, addr, SCTP_ADDR_SRC, GFP_ATOMIC);
+ ret = sctp_add_bind_addr(bp, addr, af->sockaddr_len,
+ SCTP_ADDR_SRC, GFP_ATOMIC);
/* Copy back into socket for getsockname() use. */
if (!ret) {
@@ -577,6 +578,7 @@ static int sctp_send_asconf_add_ip(struct sock *sk,
af = sctp_get_af_specific(addr->v4.sin_family);
memcpy(&saveaddr, addr, af->sockaddr_len);
retval = sctp_add_bind_addr(bp, &saveaddr,
+ sizeof(saveaddr),
SCTP_ADDR_NEW, GFP_ATOMIC);
addr_buf += af->sockaddr_len;
}
@@ -972,7 +974,7 @@ static int sctp_setsockopt_bindx(struct sock *sk,
return -EFAULT;
/* Alloc space for the address array in kernel memory. */
- kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+ kaddrs = kmalloc(addrs_size, GFP_USER | __GFP_NOWARN);
if (unlikely(!kaddrs))
return -ENOMEM;
@@ -1228,7 +1230,6 @@ out_free:
* To the hash table, try to unhash it, just in case, its a noop
* if it wasn't hashed so we're safe
*/
- sctp_unhash_established(asoc);
sctp_association_free(asoc);
}
return err;
@@ -1301,8 +1302,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
int addrs_size,
sctp_assoc_t *assoc_id)
{
- int err = 0;
struct sockaddr *kaddrs;
+ gfp_t gfp = GFP_KERNEL;
+ int err = 0;
pr_debug("%s: sk:%p addrs:%p addrs_size:%d\n",
__func__, sk, addrs, addrs_size);
@@ -1315,7 +1317,9 @@ static int __sctp_setsockopt_connectx(struct sock *sk,
return -EFAULT;
/* Alloc space for the address array in kernel memory. */
- kaddrs = kmalloc(addrs_size, GFP_KERNEL);
+ if (sk->sk_socket->file)
+ gfp = GFP_USER | __GFP_NOWARN;
+ kaddrs = kmalloc(addrs_size, gfp);
if (unlikely(!kaddrs))
return -ENOMEM;
@@ -1387,7 +1391,7 @@ static int sctp_getsockopt_connectx3(struct sock *sk, int len,
int err = 0;
#ifdef CONFIG_COMPAT
- if (is_compat_task()) {
+ if (in_compat_syscall()) {
struct compat_sctp_getaddrs_old param32;
if (len < sizeof(param32))
@@ -1501,7 +1505,6 @@ static void sctp_close(struct sock *sk, long timeout)
* ABORT or SHUTDOWN based on the linger options.
*/
if (sctp_state(asoc, CLOSED)) {
- sctp_unhash_established(asoc);
sctp_association_free(asoc);
continue;
}
@@ -1513,8 +1516,7 @@ static void sctp_close(struct sock *sk, long timeout)
struct sctp_chunk *chunk;
chunk = sctp_make_abort_user(asoc, NULL, 0);
- if (chunk)
- sctp_primitive_ABORT(net, asoc, chunk);
+ sctp_primitive_ABORT(net, asoc, chunk);
} else
sctp_primitive_SHUTDOWN(net, asoc, NULL);
}
@@ -1952,8 +1954,6 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
/* Now send the (possibly) fragmented message. */
list_for_each_entry(chunk, &datamsg->chunks, frag_list) {
- sctp_chunk_hold(chunk);
-
/* Do accounting for the write space. */
sctp_set_owner_w(chunk);
@@ -1966,15 +1966,13 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
* breaks.
*/
err = sctp_primitive_SEND(net, asoc, datamsg);
+ sctp_datamsg_put(datamsg);
/* Did the lower layer accept the chunk? */
- if (err) {
- sctp_datamsg_free(datamsg);
+ if (err)
goto out_free;
- }
pr_debug("%s: we sent primitively\n", __func__);
- sctp_datamsg_put(datamsg);
err = msg_len;
if (unlikely(wait_connect)) {
@@ -1988,10 +1986,8 @@ static int sctp_sendmsg(struct sock *sk, struct msghdr *msg, size_t msg_len)
goto out_unlock;
out_free:
- if (new_asoc) {
- sctp_unhash_established(asoc);
+ if (new_asoc)
sctp_association_free(asoc);
- }
out_unlock:
release_sock(sk);
@@ -4166,7 +4162,7 @@ static void sctp_destruct_sock(struct sock *sk)
struct sctp_sock *sp = sctp_sk(sk);
/* Free up the HMAC transform. */
- crypto_free_hash(sp->hmac);
+ crypto_free_shash(sp->hmac);
inet_sock_destruct(sk);
}
@@ -4928,7 +4924,7 @@ static int sctp_getsockopt_local_addrs(struct sock *sk, int len,
to = optval + offsetof(struct sctp_getaddrs, addrs);
space_left = len - offsetof(struct sctp_getaddrs, addrs);
- addrs = kmalloc(space_left, GFP_KERNEL);
+ addrs = kmalloc(space_left, GFP_USER | __GFP_NOWARN);
if (!addrs)
return -ENOMEM;
@@ -5544,6 +5540,7 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
struct sctp_hmac_algo_param *hmacs;
__u16 data_len = 0;
u32 num_idents;
+ int i;
if (!ep->auth_enable)
return -EACCES;
@@ -5561,8 +5558,12 @@ static int sctp_getsockopt_hmac_ident(struct sock *sk, int len,
return -EFAULT;
if (put_user(num_idents, &p->shmac_num_idents))
return -EFAULT;
- if (copy_to_user(p->shmac_idents, hmacs->hmac_ids, data_len))
- return -EFAULT;
+ for (i = 0; i < num_idents; i++) {
+ __u16 hmacid = ntohs(hmacs->hmac_ids[i]);
+
+ if (copy_to_user(&p->shmac_idents[i], &hmacid, sizeof(__u16)))
+ return -EFAULT;
+ }
return 0;
}
@@ -5777,7 +5778,7 @@ static int sctp_getsockopt_assoc_ids(struct sock *sk, int len,
len = sizeof(struct sctp_assoc_ids) + sizeof(sctp_assoc_t) * num;
- ids = kmalloc(len, GFP_KERNEL);
+ ids = kmalloc(len, GFP_USER | __GFP_NOWARN);
if (unlikely(!ids))
return -ENOMEM;
@@ -6107,9 +6108,10 @@ static int sctp_getsockopt(struct sock *sk, int level, int optname,
return retval;
}
-static void sctp_hash(struct sock *sk)
+static int sctp_hash(struct sock *sk)
{
/* STUB */
+ return 0;
}
static void sctp_unhash(struct sock *sk)
@@ -6305,13 +6307,13 @@ static int sctp_listen_start(struct sock *sk, int backlog)
{
struct sctp_sock *sp = sctp_sk(sk);
struct sctp_endpoint *ep = sp->ep;
- struct crypto_hash *tfm = NULL;
+ struct crypto_shash *tfm = NULL;
char alg[32];
/* Allocate HMAC for generating cookie. */
if (!sp->hmac && sp->sctp_hmac_alg) {
sprintf(alg, "hmac(%s)", sp->sctp_hmac_alg);
- tfm = crypto_alloc_hash(alg, 0, CRYPTO_ALG_ASYNC);
+ tfm = crypto_alloc_shash(alg, 0, 0);
if (IS_ERR(tfm)) {
net_info_ratelimited("failed to load transform for %s: %ld\n",
sp->sctp_hmac_alg, PTR_ERR(tfm));
@@ -6458,7 +6460,7 @@ unsigned int sctp_poll(struct file *file, struct socket *sock, poll_table *wait)
if (sctp_writeable(sk)) {
mask |= POLLOUT | POLLWRNORM;
} else {
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
/*
* Since the socket is not locked, the buffer
* might be made available after the writeable check and
@@ -6642,6 +6644,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
if (cmsgs->srinfo->sinfo_flags &
~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_SACK_IMMEDIATELY |
SCTP_ABORT | SCTP_EOF))
return -EINVAL;
break;
@@ -6665,6 +6668,7 @@ static int sctp_msghdr_parse(const struct msghdr *msg, sctp_cmsgs_t *cmsgs)
if (cmsgs->sinfo->snd_flags &
~(SCTP_UNORDERED | SCTP_ADDR_OVER |
+ SCTP_SACK_IMMEDIATELY |
SCTP_ABORT | SCTP_EOF))
return -EINVAL;
break;
@@ -6801,26 +6805,30 @@ no_packet:
static void __sctp_write_space(struct sctp_association *asoc)
{
struct sock *sk = asoc->base.sk;
- struct socket *sock = sk->sk_socket;
- if ((sctp_wspace(asoc) > 0) && sock) {
- if (waitqueue_active(&asoc->wait))
- wake_up_interruptible(&asoc->wait);
+ if (sctp_wspace(asoc) <= 0)
+ return;
- if (sctp_writeable(sk)) {
- wait_queue_head_t *wq = sk_sleep(sk);
+ if (waitqueue_active(&asoc->wait))
+ wake_up_interruptible(&asoc->wait);
- if (wq && waitqueue_active(wq))
- wake_up_interruptible(wq);
+ if (sctp_writeable(sk)) {
+ struct socket_wq *wq;
+
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (wq) {
+ if (waitqueue_active(&wq->wait))
+ wake_up_interruptible(&wq->wait);
/* Note that we try to include the Async I/O support
* here by modeling from the current TCP/UDP code.
* We have not tested with it yet.
*/
if (!(sk->sk_shutdown & SEND_SHUTDOWN))
- sock_wake_async(sock,
- SOCK_WAKE_SPACE, POLL_OUT);
+ sock_wake_async(wq, SOCK_WAKE_SPACE, POLL_OUT);
}
+ rcu_read_unlock();
}
}
@@ -6978,7 +6986,7 @@ void sctp_data_ready(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
POLLRDNORM | POLLRDBAND);
sk_wake_async(sk, SOCK_WAKE_WAITD, POLL_IN);
@@ -7163,6 +7171,7 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newsk->sk_type = sk->sk_type;
newsk->sk_bound_dev_if = sk->sk_bound_dev_if;
newsk->sk_flags = sk->sk_flags;
+ newsk->sk_tsflags = sk->sk_tsflags;
newsk->sk_no_check_tx = sk->sk_no_check_tx;
newsk->sk_no_check_rx = sk->sk_no_check_rx;
newsk->sk_reuse = sk->sk_reuse;
@@ -7195,6 +7204,11 @@ void sctp_copy_sock(struct sock *newsk, struct sock *sk,
newinet->mc_ttl = 1;
newinet->mc_index = 0;
newinet->mc_list = NULL;
+
+ if (newsk->sk_flags & SK_FLAGS_TIMESTAMP)
+ net_enable_timestamp();
+
+ security_sk_clone(sk, newsk);
}
static inline void sctp_copy_descendant(struct sock *sk_to,
@@ -7242,14 +7256,12 @@ static void sctp_sock_migrate(struct sock *oldsk, struct sock *newsk,
/* Hook this new socket in to the bind_hash list. */
head = &sctp_port_hashtable[sctp_phashfn(sock_net(oldsk),
inet_sk(oldsk)->inet_num)];
- local_bh_disable();
- spin_lock(&head->lock);
+ spin_lock_bh(&head->lock);
pp = sctp_sk(oldsk)->bind_hash;
sk_add_bind_node(newsk, &pp->owner);
sctp_sk(newsk)->bind_hash = pp;
inet_sk(newsk)->inet_num = inet_sk(oldsk)->inet_num;
- spin_unlock(&head->lock);
- local_bh_enable();
+ spin_unlock_bh(&head->lock);
/* Copy the bind_addr list from the original endpoint to the new
* endpoint so that we can handle restarts properly
@@ -7375,6 +7387,13 @@ struct proto sctp_prot = {
#if IS_ENABLED(CONFIG_IPV6)
+#include <net/transp_v6.h>
+static void sctp_v6_destroy_sock(struct sock *sk)
+{
+ sctp_destroy_sock(sk);
+ inet6_destroy_sock(sk);
+}
+
struct proto sctpv6_prot = {
.name = "SCTPv6",
.owner = THIS_MODULE,
@@ -7384,7 +7403,7 @@ struct proto sctpv6_prot = {
.accept = sctp_accept,
.ioctl = sctp_ioctl,
.init = sctp_init_sock,
- .destroy = sctp_destroy_sock,
+ .destroy = sctp_v6_destroy_sock,
.shutdown = sctp_shutdown,
.setsockopt = sctp_setsockopt,
.getsockopt = sctp_getsockopt,
diff --git a/net/sctp/sysctl.c b/net/sctp/sysctl.c
index 26d50c565f54..daf8554fd42a 100644
--- a/net/sctp/sysctl.c
+++ b/net/sctp/sysctl.c
@@ -308,6 +308,13 @@ static struct ctl_table sctp_net_table[] = {
.extra1 = &max_autoclose_min,
.extra2 = &max_autoclose_max,
},
+ {
+ .procname = "pf_enable",
+ .data = &init_net.sctp.pf_enable,
+ .maxlen = sizeof(int),
+ .mode = 0644,
+ .proc_handler = proc_dointvec,
+ },
{ /* sentinel */ }
};
@@ -320,7 +327,7 @@ static int proc_sctp_do_hmac_alg(struct ctl_table *ctl, int write,
struct ctl_table tbl;
bool changed = false;
char *none = "none";
- char tmp[8];
+ char tmp[8] = {0};
int ret;
memset(&tbl, 0, sizeof(struct ctl_table));
diff --git a/net/sctp/transport.c b/net/sctp/transport.c
index a0a431824f63..9b6b48c7524e 100644
--- a/net/sctp/transport.c
+++ b/net/sctp/transport.c
@@ -72,7 +72,7 @@ static struct sctp_transport *sctp_transport_init(struct net *net,
*/
peer->rto = msecs_to_jiffies(net->sctp.rto_initial);
- peer->last_time_heard = ktime_get();
+ peer->last_time_heard = ktime_set(0, 0);
peer->last_time_ecne_reduced = jiffies;
peer->param_flags = SPP_HB_DISABLE |
@@ -132,8 +132,6 @@ fail:
*/
void sctp_transport_free(struct sctp_transport *transport)
{
- transport->dead = 1;
-
/* Try to delete the heartbeat timer. */
if (del_timer(&transport->hb_timer))
sctp_transport_put(transport);
@@ -169,7 +167,7 @@ static void sctp_transport_destroy_rcu(struct rcu_head *head)
*/
static void sctp_transport_destroy(struct sctp_transport *transport)
{
- if (unlikely(!transport->dead)) {
+ if (unlikely(atomic_read(&transport->refcnt))) {
WARN(1, "Attempt to destroy undead transport %p!\n", transport);
return;
}
@@ -228,7 +226,7 @@ void sctp_transport_pmtu(struct sctp_transport *transport, struct sock *sk)
}
if (transport->dst) {
- transport->pathmtu = dst_mtu(transport->dst);
+ transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst));
} else
transport->pathmtu = SCTP_DEFAULT_MAXSEGMENT;
}
@@ -282,7 +280,7 @@ void sctp_transport_route(struct sctp_transport *transport,
return;
}
if (transport->dst) {
- transport->pathmtu = dst_mtu(transport->dst);
+ transport->pathmtu = WORD_TRUNC(dst_mtu(transport->dst));
/* Initialize sk->sk_rcv_saddr, if the transport is the
* association's active path for getsockname().
@@ -296,9 +294,9 @@ void sctp_transport_route(struct sctp_transport *transport,
}
/* Hold a reference to a transport. */
-void sctp_transport_hold(struct sctp_transport *transport)
+int sctp_transport_hold(struct sctp_transport *transport)
{
- atomic_inc(&transport->refcnt);
+ return atomic_add_unless(&transport->refcnt, 1, 0);
}
/* Release a reference to a transport and clean up
@@ -331,7 +329,7 @@ void sctp_transport_update_rto(struct sctp_transport *tp, __u32 rtt)
* 1/8, rto_alpha would be expressed as 3.
*/
tp->rttvar = tp->rttvar - (tp->rttvar >> net->sctp.rto_beta)
- + (((__u32)abs64((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta);
+ + (((__u32)abs((__s64)tp->srtt - (__s64)rtt)) >> net->sctp.rto_beta);
tp->srtt = tp->srtt - (tp->srtt >> net->sctp.rto_alpha)
+ (rtt >> net->sctp.rto_alpha);
} else {
diff --git a/net/socket.c b/net/socket.c
index dd2c247c99e3..5f77a8e93830 100644
--- a/net/socket.c
+++ b/net/socket.c
@@ -257,6 +257,7 @@ static struct inode *sock_alloc_inode(struct super_block *sb)
}
init_waitqueue_head(&wq->wait);
wq->fasync_list = NULL;
+ wq->flags = 0;
RCU_INIT_POINTER(ei->socket.wq, wq);
ei->socket.state = SS_UNCONNECTED;
@@ -293,7 +294,7 @@ static int init_inodecache(void)
0,
(SLAB_HWCACHE_ALIGN |
SLAB_RECLAIM_ACCOUNT |
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD | SLAB_ACCOUNT),
init_once);
if (sock_inode_cachep == NULL)
return -ENOMEM;
@@ -532,7 +533,7 @@ static const struct inode_operations sockfs_inode_ops = {
* NULL is returned.
*/
-static struct socket *sock_alloc(void)
+struct socket *sock_alloc(void)
{
struct inode *inode;
struct socket *sock;
@@ -553,6 +554,7 @@ static struct socket *sock_alloc(void)
this_cpu_add(sockets_in_use, 1);
return sock;
}
+EXPORT_SYMBOL(sock_alloc);
/**
* sock_release - close a socket
@@ -1056,27 +1058,20 @@ static int sock_fasync(int fd, struct file *filp, int on)
return 0;
}
-/* This function may be called only under socket lock or callback_lock or rcu_lock */
+/* This function may be called only under rcu_lock */
-int sock_wake_async(struct socket *sock, int how, int band)
+int sock_wake_async(struct socket_wq *wq, int how, int band)
{
- struct socket_wq *wq;
-
- if (!sock)
- return -1;
- rcu_read_lock();
- wq = rcu_dereference(sock->wq);
- if (!wq || !wq->fasync_list) {
- rcu_read_unlock();
+ if (!wq || !wq->fasync_list)
return -1;
- }
+
switch (how) {
case SOCK_WAKE_WAITD:
- if (test_bit(SOCK_ASYNC_WAITDATA, &sock->flags))
+ if (test_bit(SOCKWQ_ASYNC_WAITDATA, &wq->flags))
break;
goto call_kill;
case SOCK_WAKE_SPACE:
- if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags))
+ if (!test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags))
break;
/* fall through */
case SOCK_WAKE_IO:
@@ -1086,7 +1081,7 @@ call_kill:
case SOCK_WAKE_URG:
kill_fasync(&wq->fasync_list, SIGURG, band);
}
- rcu_read_unlock();
+
return 0;
}
EXPORT_SYMBOL(sock_wake_async);
@@ -1112,12 +1107,8 @@ int __sock_create(struct net *net, int family, int type, int protocol,
deadlock in module load.
*/
if (family == PF_INET && type == SOCK_PACKET) {
- static int warned;
- if (!warned) {
- warned = 1;
- pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
- current->comm);
- }
+ pr_info_once("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
+ current->comm);
family = PF_PACKET;
}
@@ -1702,6 +1693,7 @@ SYSCALL_DEFINE6(recvfrom, int, fd, void __user *, ubuf, size_t, size,
msg.msg_name = addr ? (struct sockaddr *)&address : NULL;
/* We assume all kernel code knows the size of sockaddr_storage */
msg.msg_namelen = 0;
+ msg.msg_iocb = NULL;
if (sock->file->f_flags & O_NONBLOCK)
flags |= MSG_DONTWAIT;
err = sock_recvmsg(sock, &msg, iov_iter_count(&msg.msg_iter), flags);
@@ -1879,7 +1871,8 @@ static int copy_msghdr_from_user(struct msghdr *kmsg,
static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
struct msghdr *msg_sys, unsigned int flags,
- struct used_address *used_address)
+ struct used_address *used_address,
+ unsigned int allowed_msghdr_flags)
{
struct compat_msghdr __user *msg_compat =
(struct compat_msghdr __user *)msg;
@@ -1905,6 +1898,7 @@ static int ___sys_sendmsg(struct socket *sock, struct user_msghdr __user *msg,
if (msg_sys->msg_controllen > INT_MAX)
goto out_freeiov;
+ flags |= (msg_sys->msg_flags & allowed_msghdr_flags);
ctl_len = msg_sys->msg_controllen;
if ((MSG_CMSG_COMPAT & flags) && ctl_len) {
err =
@@ -1983,7 +1977,7 @@ long __sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags)
if (!sock)
goto out;
- err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL);
+ err = ___sys_sendmsg(sock, msg, &msg_sys, flags, NULL, 0);
fput_light(sock->file, fput_needed);
out:
@@ -2010,6 +2004,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
struct compat_mmsghdr __user *compat_entry;
struct msghdr msg_sys;
struct used_address used_address;
+ unsigned int oflags = flags;
if (vlen > UIO_MAXIOV)
vlen = UIO_MAXIOV;
@@ -2024,11 +2019,15 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
entry = mmsg;
compat_entry = (struct compat_mmsghdr __user *)mmsg;
err = 0;
+ flags |= MSG_BATCH;
while (datagrams < vlen) {
+ if (datagrams == vlen - 1)
+ flags = oflags;
+
if (MSG_CMSG_COMPAT & flags) {
err = ___sys_sendmsg(sock, (struct user_msghdr __user *)compat_entry,
- &msg_sys, flags, &used_address);
+ &msg_sys, flags, &used_address, MSG_EOR);
if (err < 0)
break;
err = __put_user(err, &compat_entry->msg_len);
@@ -2036,7 +2035,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
} else {
err = ___sys_sendmsg(sock,
(struct user_msghdr __user *)entry,
- &msg_sys, flags, &used_address);
+ &msg_sys, flags, &used_address, MSG_EOR);
if (err < 0)
break;
err = put_user(err, &entry->msg_len);
@@ -2046,6 +2045,7 @@ int __sys_sendmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
if (err)
break;
++datagrams;
+ cond_resched();
}
fput_light(sock->file, fput_needed);
@@ -2241,33 +2241,34 @@ int __sys_recvmmsg(int fd, struct mmsghdr __user *mmsg, unsigned int vlen,
/* Out of band data, return right away */
if (msg_sys.msg_flags & MSG_OOB)
break;
+ cond_resched();
}
-out_put:
- fput_light(sock->file, fput_needed);
-
if (err == 0)
- return datagrams;
+ goto out_put;
+
+ if (datagrams == 0) {
+ datagrams = err;
+ goto out_put;
+ }
- if (datagrams != 0) {
+ /*
+ * We may return less entries than requested (vlen) if the
+ * sock is non block and there aren't enough datagrams...
+ */
+ if (err != -EAGAIN) {
/*
- * We may return less entries than requested (vlen) if the
- * sock is non block and there aren't enough datagrams...
+ * ... or if recvmsg returns an error after we
+ * received some datagrams, where we record the
+ * error to return on the next call or if the
+ * app asks about it using getsockopt(SO_ERROR).
*/
- if (err != -EAGAIN) {
- /*
- * ... or if recvmsg returns an error after we
- * received some datagrams, where we record the
- * error to return on the next call or if the
- * app asks about it using getsockopt(SO_ERROR).
- */
- sock->sk->sk_err = -err;
- }
-
- return datagrams;
+ sock->sk->sk_err = -err;
}
+out_put:
+ fput_light(sock->file, fput_needed);
- return err;
+ return datagrams;
}
SYSCALL_DEFINE5(recvmmsg, int, fd, struct mmsghdr __user *, mmsg,
diff --git a/net/sunrpc/Makefile b/net/sunrpc/Makefile
index b512fbd9d79a..ea7ffa12e0f9 100644
--- a/net/sunrpc/Makefile
+++ b/net/sunrpc/Makefile
@@ -12,7 +12,8 @@ sunrpc-y := clnt.o xprt.o socklib.o xprtsock.o sched.o \
svc.o svcsock.o svcauth.o svcauth_unix.o \
addr.o rpcb_clnt.o timer.o xdr.o \
sunrpc_syms.o cache.o rpc_pipe.o \
- svc_xprt.o
+ svc_xprt.o \
+ xprtmultipath.o
sunrpc-$(CONFIG_SUNRPC_DEBUG) += debugfs.o
sunrpc-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel_rqst.o
sunrpc-$(CONFIG_PROC_FS) += stats.o
diff --git a/net/sunrpc/auth_gss/auth_gss.c b/net/sunrpc/auth_gss/auth_gss.c
index dace13d7638e..15612ffa8d57 100644
--- a/net/sunrpc/auth_gss/auth_gss.c
+++ b/net/sunrpc/auth_gss/auth_gss.c
@@ -740,7 +740,7 @@ gss_pipe_downcall(struct file *filp, const char __user *src, size_t mlen)
default:
printk(KERN_CRIT "%s: bad return from "
"gss_fill_context: %zd\n", __func__, err);
- BUG();
+ gss_msg->msg.errno = -EIO;
}
goto err_release_msg;
}
@@ -1181,12 +1181,12 @@ static struct rpc_auth *
gss_create(struct rpc_auth_create_args *args, struct rpc_clnt *clnt)
{
struct gss_auth *gss_auth;
- struct rpc_xprt *xprt = rcu_access_pointer(clnt->cl_xprt);
+ struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch);
while (clnt != clnt->cl_parent) {
struct rpc_clnt *parent = clnt->cl_parent;
/* Find the original parent for this transport */
- if (rcu_access_pointer(parent->cl_xprt) != xprt)
+ if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps)
break;
clnt = parent;
}
@@ -1411,17 +1411,16 @@ gss_key_timeout(struct rpc_cred *rc)
{
struct gss_cred *gss_cred = container_of(rc, struct gss_cred, gc_base);
struct gss_cl_ctx *ctx;
- unsigned long now = jiffies;
- unsigned long expire;
+ unsigned long timeout = jiffies + (gss_key_expire_timeo * HZ);
+ int ret = 0;
rcu_read_lock();
ctx = rcu_dereference(gss_cred->gc_ctx);
- if (ctx)
- expire = ctx->gc_expiry - (gss_key_expire_timeo * HZ);
+ if (!ctx || time_after(timeout, ctx->gc_expiry))
+ ret = -EACCES;
rcu_read_unlock();
- if (!ctx || time_after(now, expire))
- return -EACCES;
- return 0;
+
+ return ret;
}
static int
@@ -1729,8 +1728,8 @@ alloc_enc_pages(struct rpc_rqst *rqstp)
return 0;
}
- first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
- last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_CACHE_SHIFT;
+ first = snd_buf->page_base >> PAGE_SHIFT;
+ last = (snd_buf->page_base + snd_buf->page_len - 1) >> PAGE_SHIFT;
rqstp->rq_enc_pages_num = last - first + 1 + 1;
rqstp->rq_enc_pages
= kmalloc(rqstp->rq_enc_pages_num * sizeof(struct page *),
@@ -1776,10 +1775,10 @@ gss_wrap_req_priv(struct rpc_cred *cred, struct gss_cl_ctx *ctx,
status = alloc_enc_pages(rqstp);
if (status)
return status;
- first = snd_buf->page_base >> PAGE_CACHE_SHIFT;
+ first = snd_buf->page_base >> PAGE_SHIFT;
inpages = snd_buf->pages + first;
snd_buf->pages = rqstp->rq_enc_pages;
- snd_buf->page_base -= first << PAGE_CACHE_SHIFT;
+ snd_buf->page_base -= first << PAGE_SHIFT;
/*
* Give the tail its own page, in case we need extra space in the
* head when wrapping:
diff --git a/net/sunrpc/auth_gss/gss_krb5_crypto.c b/net/sunrpc/auth_gss/gss_krb5_crypto.c
index fee3c15a4b52..244245bcbbd2 100644
--- a/net/sunrpc/auth_gss/gss_krb5_crypto.c
+++ b/net/sunrpc/auth_gss/gss_krb5_crypto.c
@@ -34,11 +34,12 @@
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <linux/err.h>
#include <linux/types.h>
#include <linux/mm.h>
#include <linux/scatterlist.h>
-#include <linux/crypto.h>
#include <linux/highmem.h>
#include <linux/pagemap.h>
#include <linux/random.h>
@@ -51,7 +52,7 @@
u32
krb5_encrypt(
- struct crypto_blkcipher *tfm,
+ struct crypto_skcipher *tfm,
void * iv,
void * in,
void * out,
@@ -60,24 +61,29 @@ krb5_encrypt(
u32 ret = -EINVAL;
struct scatterlist sg[1];
u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
- struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv };
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
- if (length % crypto_blkcipher_blocksize(tfm) != 0)
+ if (length % crypto_skcipher_blocksize(tfm) != 0)
goto out;
- if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+ if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
dprintk("RPC: gss_k5encrypt: tfm iv size too large %d\n",
- crypto_blkcipher_ivsize(tfm));
+ crypto_skcipher_ivsize(tfm));
goto out;
}
if (iv)
- memcpy(local_iv, iv, crypto_blkcipher_ivsize(tfm));
+ memcpy(local_iv, iv, crypto_skcipher_ivsize(tfm));
memcpy(out, in, length);
sg_init_one(sg, out, length);
- ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, length);
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg, sg, length, local_iv);
+
+ ret = crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
out:
dprintk("RPC: krb5_encrypt returns %d\n", ret);
return ret;
@@ -85,7 +91,7 @@ out:
u32
krb5_decrypt(
- struct crypto_blkcipher *tfm,
+ struct crypto_skcipher *tfm,
void * iv,
void * in,
void * out,
@@ -94,23 +100,28 @@ krb5_decrypt(
u32 ret = -EINVAL;
struct scatterlist sg[1];
u8 local_iv[GSS_KRB5_MAX_BLOCKSIZE] = {0};
- struct blkcipher_desc desc = { .tfm = tfm, .info = local_iv };
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
- if (length % crypto_blkcipher_blocksize(tfm) != 0)
+ if (length % crypto_skcipher_blocksize(tfm) != 0)
goto out;
- if (crypto_blkcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
+ if (crypto_skcipher_ivsize(tfm) > GSS_KRB5_MAX_BLOCKSIZE) {
dprintk("RPC: gss_k5decrypt: tfm iv size too large %d\n",
- crypto_blkcipher_ivsize(tfm));
+ crypto_skcipher_ivsize(tfm));
goto out;
}
if (iv)
- memcpy(local_iv,iv, crypto_blkcipher_ivsize(tfm));
+ memcpy(local_iv,iv, crypto_skcipher_ivsize(tfm));
memcpy(out, in, length);
sg_init_one(sg, out, length);
- ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, length);
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg, sg, length, local_iv);
+
+ ret = crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
out:
dprintk("RPC: gss_k5decrypt returns %d\n",ret);
return ret;
@@ -119,9 +130,11 @@ out:
static int
checksummer(struct scatterlist *sg, void *data)
{
- struct hash_desc *desc = data;
+ struct ahash_request *req = data;
+
+ ahash_request_set_crypt(req, sg, NULL, sg->length);
- return crypto_hash_update(desc, sg, sg->length);
+ return crypto_ahash_update(req);
}
static int
@@ -152,13 +165,13 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
struct xdr_buf *body, int body_offset, u8 *cksumkey,
unsigned int usage, struct xdr_netobj *cksumout)
{
- struct hash_desc desc;
struct scatterlist sg[1];
int err;
u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
u8 rc4salt[4];
- struct crypto_hash *md5;
- struct crypto_hash *hmac_md5;
+ struct crypto_ahash *md5;
+ struct crypto_ahash *hmac_md5;
+ struct ahash_request *req;
if (cksumkey == NULL)
return GSS_S_FAILURE;
@@ -174,61 +187,79 @@ make_checksum_hmac_md5(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
- md5 = crypto_alloc_hash("md5", 0, CRYPTO_ALG_ASYNC);
+ md5 = crypto_alloc_ahash("md5", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(md5))
return GSS_S_FAILURE;
- hmac_md5 = crypto_alloc_hash(kctx->gk5e->cksum_name, 0,
- CRYPTO_ALG_ASYNC);
+ hmac_md5 = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(hmac_md5)) {
- crypto_free_hash(md5);
+ crypto_free_ahash(md5);
+ return GSS_S_FAILURE;
+ }
+
+ req = ahash_request_alloc(md5, GFP_KERNEL);
+ if (!req) {
+ crypto_free_ahash(hmac_md5);
+ crypto_free_ahash(md5);
return GSS_S_FAILURE;
}
- desc.tfm = md5;
- desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
- err = crypto_hash_init(&desc);
+ err = crypto_ahash_init(req);
if (err)
goto out;
sg_init_one(sg, rc4salt, 4);
- err = crypto_hash_update(&desc, sg, 4);
+ ahash_request_set_crypt(req, sg, NULL, 4);
+ err = crypto_ahash_update(req);
if (err)
goto out;
sg_init_one(sg, header, hdrlen);
- err = crypto_hash_update(&desc, sg, hdrlen);
+ ahash_request_set_crypt(req, sg, NULL, hdrlen);
+ err = crypto_ahash_update(req);
if (err)
goto out;
err = xdr_process_buf(body, body_offset, body->len - body_offset,
- checksummer, &desc);
+ checksummer, req);
if (err)
goto out;
- err = crypto_hash_final(&desc, checksumdata);
+ ahash_request_set_crypt(req, NULL, checksumdata, 0);
+ err = crypto_ahash_final(req);
if (err)
goto out;
- desc.tfm = hmac_md5;
- desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ ahash_request_free(req);
+ req = ahash_request_alloc(hmac_md5, GFP_KERNEL);
+ if (!req) {
+ crypto_free_ahash(hmac_md5);
+ crypto_free_ahash(md5);
+ return GSS_S_FAILURE;
+ }
+
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
- err = crypto_hash_init(&desc);
+ err = crypto_ahash_init(req);
if (err)
goto out;
- err = crypto_hash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
+ err = crypto_ahash_setkey(hmac_md5, cksumkey, kctx->gk5e->keylength);
if (err)
goto out;
- sg_init_one(sg, checksumdata, crypto_hash_digestsize(md5));
- err = crypto_hash_digest(&desc, sg, crypto_hash_digestsize(md5),
- checksumdata);
+ sg_init_one(sg, checksumdata, crypto_ahash_digestsize(md5));
+ ahash_request_set_crypt(req, sg, checksumdata,
+ crypto_ahash_digestsize(md5));
+ err = crypto_ahash_digest(req);
if (err)
goto out;
memcpy(cksumout->data, checksumdata, kctx->gk5e->cksumlength);
cksumout->len = kctx->gk5e->cksumlength;
out:
- crypto_free_hash(md5);
- crypto_free_hash(hmac_md5);
+ ahash_request_free(req);
+ crypto_free_ahash(md5);
+ crypto_free_ahash(hmac_md5);
return err ? GSS_S_FAILURE : 0;
}
@@ -242,7 +273,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
struct xdr_buf *body, int body_offset, u8 *cksumkey,
unsigned int usage, struct xdr_netobj *cksumout)
{
- struct hash_desc desc;
+ struct crypto_ahash *tfm;
+ struct ahash_request *req;
struct scatterlist sg[1];
int err;
u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
@@ -259,32 +291,41 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
- desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
- if (IS_ERR(desc.tfm))
+ tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
return GSS_S_FAILURE;
- desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
- checksumlen = crypto_hash_digestsize(desc.tfm);
+ req = ahash_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ crypto_free_ahash(tfm);
+ return GSS_S_FAILURE;
+ }
+
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
+ checksumlen = crypto_ahash_digestsize(tfm);
if (cksumkey != NULL) {
- err = crypto_hash_setkey(desc.tfm, cksumkey,
- kctx->gk5e->keylength);
+ err = crypto_ahash_setkey(tfm, cksumkey,
+ kctx->gk5e->keylength);
if (err)
goto out;
}
- err = crypto_hash_init(&desc);
+ err = crypto_ahash_init(req);
if (err)
goto out;
sg_init_one(sg, header, hdrlen);
- err = crypto_hash_update(&desc, sg, hdrlen);
+ ahash_request_set_crypt(req, sg, NULL, hdrlen);
+ err = crypto_ahash_update(req);
if (err)
goto out;
err = xdr_process_buf(body, body_offset, body->len - body_offset,
- checksummer, &desc);
+ checksummer, req);
if (err)
goto out;
- err = crypto_hash_final(&desc, checksumdata);
+ ahash_request_set_crypt(req, NULL, checksumdata, 0);
+ err = crypto_ahash_final(req);
if (err)
goto out;
@@ -307,7 +348,8 @@ make_checksum(struct krb5_ctx *kctx, char *header, int hdrlen,
}
cksumout->len = kctx->gk5e->cksumlength;
out:
- crypto_free_hash(desc.tfm);
+ ahash_request_free(req);
+ crypto_free_ahash(tfm);
return err ? GSS_S_FAILURE : 0;
}
@@ -323,7 +365,8 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
struct xdr_buf *body, int body_offset, u8 *cksumkey,
unsigned int usage, struct xdr_netobj *cksumout)
{
- struct hash_desc desc;
+ struct crypto_ahash *tfm;
+ struct ahash_request *req;
struct scatterlist sg[1];
int err;
u8 checksumdata[GSS_KRB5_MAX_CKSUM_LEN];
@@ -340,31 +383,39 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
return GSS_S_FAILURE;
}
- desc.tfm = crypto_alloc_hash(kctx->gk5e->cksum_name, 0,
- CRYPTO_ALG_ASYNC);
- if (IS_ERR(desc.tfm))
+ tfm = crypto_alloc_ahash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ if (IS_ERR(tfm))
return GSS_S_FAILURE;
- checksumlen = crypto_hash_digestsize(desc.tfm);
- desc.flags = CRYPTO_TFM_REQ_MAY_SLEEP;
+ checksumlen = crypto_ahash_digestsize(tfm);
+
+ req = ahash_request_alloc(tfm, GFP_KERNEL);
+ if (!req) {
+ crypto_free_ahash(tfm);
+ return GSS_S_FAILURE;
+ }
- err = crypto_hash_setkey(desc.tfm, cksumkey, kctx->gk5e->keylength);
+ ahash_request_set_callback(req, CRYPTO_TFM_REQ_MAY_SLEEP, NULL, NULL);
+
+ err = crypto_ahash_setkey(tfm, cksumkey, kctx->gk5e->keylength);
if (err)
goto out;
- err = crypto_hash_init(&desc);
+ err = crypto_ahash_init(req);
if (err)
goto out;
err = xdr_process_buf(body, body_offset, body->len - body_offset,
- checksummer, &desc);
+ checksummer, req);
if (err)
goto out;
if (header != NULL) {
sg_init_one(sg, header, hdrlen);
- err = crypto_hash_update(&desc, sg, hdrlen);
+ ahash_request_set_crypt(req, sg, NULL, hdrlen);
+ err = crypto_ahash_update(req);
if (err)
goto out;
}
- err = crypto_hash_final(&desc, checksumdata);
+ ahash_request_set_crypt(req, NULL, checksumdata, 0);
+ err = crypto_ahash_final(req);
if (err)
goto out;
@@ -381,13 +432,14 @@ make_checksum_v2(struct krb5_ctx *kctx, char *header, int hdrlen,
break;
}
out:
- crypto_free_hash(desc.tfm);
+ ahash_request_free(req);
+ crypto_free_ahash(tfm);
return err ? GSS_S_FAILURE : 0;
}
struct encryptor_desc {
u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
- struct blkcipher_desc desc;
+ struct skcipher_request *req;
int pos;
struct xdr_buf *outbuf;
struct page **pages;
@@ -402,6 +454,7 @@ encryptor(struct scatterlist *sg, void *data)
{
struct encryptor_desc *desc = data;
struct xdr_buf *outbuf = desc->outbuf;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req);
struct page *in_page;
int thislen = desc->fraglen + sg->length;
int fraglen, ret;
@@ -414,7 +467,7 @@ encryptor(struct scatterlist *sg, void *data)
page_pos = desc->pos - outbuf->head[0].iov_len;
if (page_pos >= 0 && page_pos < outbuf->page_len) {
/* pages are not in place: */
- int i = (page_pos + outbuf->page_base) >> PAGE_CACHE_SHIFT;
+ int i = (page_pos + outbuf->page_base) >> PAGE_SHIFT;
in_page = desc->pages[i];
} else {
in_page = sg_page(sg);
@@ -427,7 +480,7 @@ encryptor(struct scatterlist *sg, void *data)
desc->fraglen += sg->length;
desc->pos += sg->length;
- fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1);
+ fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1);
thislen -= fraglen;
if (thislen == 0)
@@ -436,8 +489,10 @@ encryptor(struct scatterlist *sg, void *data)
sg_mark_end(&desc->infrags[desc->fragno - 1]);
sg_mark_end(&desc->outfrags[desc->fragno - 1]);
- ret = crypto_blkcipher_encrypt_iv(&desc->desc, desc->outfrags,
- desc->infrags, thislen);
+ skcipher_request_set_crypt(desc->req, desc->infrags, desc->outfrags,
+ thislen, desc->iv);
+
+ ret = crypto_skcipher_encrypt(desc->req);
if (ret)
return ret;
@@ -459,18 +514,20 @@ encryptor(struct scatterlist *sg, void *data)
}
int
-gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
+gss_encrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf,
int offset, struct page **pages)
{
int ret;
struct encryptor_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
+
+ BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0);
- BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0);
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
memset(desc.iv, 0, sizeof(desc.iv));
- desc.desc.tfm = tfm;
- desc.desc.info = desc.iv;
- desc.desc.flags = 0;
+ desc.req = req;
desc.pos = offset;
desc.outbuf = buf;
desc.pages = pages;
@@ -481,12 +538,13 @@ gss_encrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
sg_init_table(desc.outfrags, 4);
ret = xdr_process_buf(buf, offset, buf->len - offset, encryptor, &desc);
+ skcipher_request_zero(req);
return ret;
}
struct decryptor_desc {
u8 iv[GSS_KRB5_MAX_BLOCKSIZE];
- struct blkcipher_desc desc;
+ struct skcipher_request *req;
struct scatterlist frags[4];
int fragno;
int fraglen;
@@ -497,6 +555,7 @@ decryptor(struct scatterlist *sg, void *data)
{
struct decryptor_desc *desc = data;
int thislen = desc->fraglen + sg->length;
+ struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(desc->req);
int fraglen, ret;
/* Worst case is 4 fragments: head, end of page 1, start
@@ -507,7 +566,7 @@ decryptor(struct scatterlist *sg, void *data)
desc->fragno++;
desc->fraglen += sg->length;
- fraglen = thislen & (crypto_blkcipher_blocksize(desc->desc.tfm) - 1);
+ fraglen = thislen & (crypto_skcipher_blocksize(tfm) - 1);
thislen -= fraglen;
if (thislen == 0)
@@ -515,8 +574,10 @@ decryptor(struct scatterlist *sg, void *data)
sg_mark_end(&desc->frags[desc->fragno - 1]);
- ret = crypto_blkcipher_decrypt_iv(&desc->desc, desc->frags,
- desc->frags, thislen);
+ skcipher_request_set_crypt(desc->req, desc->frags, desc->frags,
+ thislen, desc->iv);
+
+ ret = crypto_skcipher_decrypt(desc->req);
if (ret)
return ret;
@@ -535,24 +596,29 @@ decryptor(struct scatterlist *sg, void *data)
}
int
-gss_decrypt_xdr_buf(struct crypto_blkcipher *tfm, struct xdr_buf *buf,
+gss_decrypt_xdr_buf(struct crypto_skcipher *tfm, struct xdr_buf *buf,
int offset)
{
+ int ret;
struct decryptor_desc desc;
+ SKCIPHER_REQUEST_ON_STACK(req, tfm);
/* XXXJBF: */
- BUG_ON((buf->len - offset) % crypto_blkcipher_blocksize(tfm) != 0);
+ BUG_ON((buf->len - offset) % crypto_skcipher_blocksize(tfm) != 0);
+
+ skcipher_request_set_tfm(req, tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
memset(desc.iv, 0, sizeof(desc.iv));
- desc.desc.tfm = tfm;
- desc.desc.info = desc.iv;
- desc.desc.flags = 0;
+ desc.req = req;
desc.fragno = 0;
desc.fraglen = 0;
sg_init_table(desc.frags, 4);
- return xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
+ ret = xdr_process_buf(buf, offset, buf->len - offset, decryptor, &desc);
+ skcipher_request_zero(req);
+ return ret;
}
/*
@@ -594,12 +660,12 @@ xdr_extend_head(struct xdr_buf *buf, unsigned int base, unsigned int shiftlen)
}
static u32
-gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf,
+gss_krb5_cts_crypt(struct crypto_skcipher *cipher, struct xdr_buf *buf,
u32 offset, u8 *iv, struct page **pages, int encrypt)
{
u32 ret;
struct scatterlist sg[1];
- struct blkcipher_desc desc = { .tfm = cipher, .info = iv };
+ SKCIPHER_REQUEST_ON_STACK(req, cipher);
u8 data[GSS_KRB5_MAX_BLOCKSIZE * 2];
struct page **save_pages;
u32 len = buf->len - offset;
@@ -625,10 +691,16 @@ gss_krb5_cts_crypt(struct crypto_blkcipher *cipher, struct xdr_buf *buf,
sg_init_one(sg, data, len);
+ skcipher_request_set_tfm(req, cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, sg, sg, len, iv);
+
if (encrypt)
- ret = crypto_blkcipher_encrypt_iv(&desc, sg, sg, len);
+ ret = crypto_skcipher_encrypt(req);
else
- ret = crypto_blkcipher_decrypt_iv(&desc, sg, sg, len);
+ ret = crypto_skcipher_decrypt(req);
+
+ skcipher_request_zero(req);
if (ret)
goto out;
@@ -647,7 +719,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
struct xdr_netobj hmac;
u8 *cksumkey;
u8 *ecptr;
- struct crypto_blkcipher *cipher, *aux_cipher;
+ struct crypto_skcipher *cipher, *aux_cipher;
int blocksize;
struct page **save_pages;
int nblocks, nbytes;
@@ -666,7 +738,7 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
cksumkey = kctx->acceptor_integ;
usage = KG_USAGE_ACCEPTOR_SEAL;
}
- blocksize = crypto_blkcipher_blocksize(cipher);
+ blocksize = crypto_skcipher_blocksize(cipher);
/* hide the gss token header and insert the confounder */
offset += GSS_KRB5_TOK_HDR_LEN;
@@ -719,20 +791,24 @@ gss_krb5_aes_encrypt(struct krb5_ctx *kctx, u32 offset,
memset(desc.iv, 0, sizeof(desc.iv));
if (cbcbytes) {
+ SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
+
desc.pos = offset + GSS_KRB5_TOK_HDR_LEN;
desc.fragno = 0;
desc.fraglen = 0;
desc.pages = pages;
desc.outbuf = buf;
- desc.desc.info = desc.iv;
- desc.desc.flags = 0;
- desc.desc.tfm = aux_cipher;
+ desc.req = req;
+
+ skcipher_request_set_tfm(req, aux_cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
sg_init_table(desc.infrags, 4);
sg_init_table(desc.outfrags, 4);
err = xdr_process_buf(buf, offset + GSS_KRB5_TOK_HDR_LEN,
cbcbytes, encryptor, &desc);
+ skcipher_request_zero(req);
if (err)
goto out_err;
}
@@ -763,7 +839,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
struct xdr_buf subbuf;
u32 ret = 0;
u8 *cksum_key;
- struct crypto_blkcipher *cipher, *aux_cipher;
+ struct crypto_skcipher *cipher, *aux_cipher;
struct xdr_netobj our_hmac_obj;
u8 our_hmac[GSS_KRB5_MAX_CKSUM_LEN];
u8 pkt_hmac[GSS_KRB5_MAX_CKSUM_LEN];
@@ -782,7 +858,7 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
cksum_key = kctx->initiator_integ;
usage = KG_USAGE_INITIATOR_SEAL;
}
- blocksize = crypto_blkcipher_blocksize(cipher);
+ blocksize = crypto_skcipher_blocksize(cipher);
/* create a segment skipping the header and leaving out the checksum */
@@ -799,15 +875,19 @@ gss_krb5_aes_decrypt(struct krb5_ctx *kctx, u32 offset, struct xdr_buf *buf,
memset(desc.iv, 0, sizeof(desc.iv));
if (cbcbytes) {
+ SKCIPHER_REQUEST_ON_STACK(req, aux_cipher);
+
desc.fragno = 0;
desc.fraglen = 0;
- desc.desc.info = desc.iv;
- desc.desc.flags = 0;
- desc.desc.tfm = aux_cipher;
+ desc.req = req;
+
+ skcipher_request_set_tfm(req, aux_cipher);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
sg_init_table(desc.frags, 4);
ret = xdr_process_buf(&subbuf, 0, cbcbytes, decryptor, &desc);
+ skcipher_request_zero(req);
if (ret)
goto out_err;
}
@@ -850,61 +930,63 @@ out_err:
* Set the key of the given cipher.
*/
int
-krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
+krb5_rc4_setup_seq_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
unsigned char *cksum)
{
- struct crypto_hash *hmac;
- struct hash_desc desc;
- struct scatterlist sg[1];
+ struct crypto_shash *hmac;
+ struct shash_desc *desc;
u8 Kseq[GSS_KRB5_MAX_KEYLEN];
u32 zeroconstant = 0;
int err;
dprintk("%s: entered\n", __func__);
- hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
if (IS_ERR(hmac)) {
dprintk("%s: error %ld, allocating hash '%s'\n",
__func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
return PTR_ERR(hmac);
}
- desc.tfm = hmac;
- desc.flags = 0;
+ desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
+ GFP_KERNEL);
+ if (!desc) {
+ dprintk("%s: failed to allocate shash descriptor for '%s'\n",
+ __func__, kctx->gk5e->cksum_name);
+ crypto_free_shash(hmac);
+ return -ENOMEM;
+ }
- err = crypto_hash_init(&desc);
- if (err)
- goto out_err;
+ desc->tfm = hmac;
+ desc->flags = 0;
/* Compute intermediate Kseq from session key */
- err = crypto_hash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
+ err = crypto_shash_setkey(hmac, kctx->Ksess, kctx->gk5e->keylength);
if (err)
goto out_err;
- sg_init_one(sg, &zeroconstant, 4);
- err = crypto_hash_digest(&desc, sg, 4, Kseq);
+ err = crypto_shash_digest(desc, (u8 *)&zeroconstant, 4, Kseq);
if (err)
goto out_err;
/* Compute final Kseq from the checksum and intermediate Kseq */
- err = crypto_hash_setkey(hmac, Kseq, kctx->gk5e->keylength);
+ err = crypto_shash_setkey(hmac, Kseq, kctx->gk5e->keylength);
if (err)
goto out_err;
- sg_set_buf(sg, cksum, 8);
-
- err = crypto_hash_digest(&desc, sg, 8, Kseq);
+ err = crypto_shash_digest(desc, cksum, 8, Kseq);
if (err)
goto out_err;
- err = crypto_blkcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
+ err = crypto_skcipher_setkey(cipher, Kseq, kctx->gk5e->keylength);
if (err)
goto out_err;
err = 0;
out_err:
- crypto_free_hash(hmac);
+ kzfree(desc);
+ crypto_free_shash(hmac);
dprintk("%s: returning %d\n", __func__, err);
return err;
}
@@ -914,12 +996,11 @@ out_err:
* Set the key of cipher kctx->enc.
*/
int
-krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
+krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_skcipher *cipher,
s32 seqnum)
{
- struct crypto_hash *hmac;
- struct hash_desc desc;
- struct scatterlist sg[1];
+ struct crypto_shash *hmac;
+ struct shash_desc *desc;
u8 Kcrypt[GSS_KRB5_MAX_KEYLEN];
u8 zeroconstant[4] = {0};
u8 seqnumarray[4];
@@ -927,35 +1008,39 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
dprintk("%s: entered, seqnum %u\n", __func__, seqnum);
- hmac = crypto_alloc_hash(kctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ hmac = crypto_alloc_shash(kctx->gk5e->cksum_name, 0, 0);
if (IS_ERR(hmac)) {
dprintk("%s: error %ld, allocating hash '%s'\n",
__func__, PTR_ERR(hmac), kctx->gk5e->cksum_name);
return PTR_ERR(hmac);
}
- desc.tfm = hmac;
- desc.flags = 0;
+ desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
+ GFP_KERNEL);
+ if (!desc) {
+ dprintk("%s: failed to allocate shash descriptor for '%s'\n",
+ __func__, kctx->gk5e->cksum_name);
+ crypto_free_shash(hmac);
+ return -ENOMEM;
+ }
- err = crypto_hash_init(&desc);
- if (err)
- goto out_err;
+ desc->tfm = hmac;
+ desc->flags = 0;
/* Compute intermediate Kcrypt from session key */
for (i = 0; i < kctx->gk5e->keylength; i++)
Kcrypt[i] = kctx->Ksess[i] ^ 0xf0;
- err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
+ err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
if (err)
goto out_err;
- sg_init_one(sg, zeroconstant, 4);
- err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
+ err = crypto_shash_digest(desc, zeroconstant, 4, Kcrypt);
if (err)
goto out_err;
/* Compute final Kcrypt from the seqnum and intermediate Kcrypt */
- err = crypto_hash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
+ err = crypto_shash_setkey(hmac, Kcrypt, kctx->gk5e->keylength);
if (err)
goto out_err;
@@ -964,20 +1049,19 @@ krb5_rc4_setup_enc_key(struct krb5_ctx *kctx, struct crypto_blkcipher *cipher,
seqnumarray[2] = (unsigned char) ((seqnum >> 8) & 0xff);
seqnumarray[3] = (unsigned char) ((seqnum >> 0) & 0xff);
- sg_set_buf(sg, seqnumarray, 4);
-
- err = crypto_hash_digest(&desc, sg, 4, Kcrypt);
+ err = crypto_shash_digest(desc, seqnumarray, 4, Kcrypt);
if (err)
goto out_err;
- err = crypto_blkcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength);
+ err = crypto_skcipher_setkey(cipher, Kcrypt, kctx->gk5e->keylength);
if (err)
goto out_err;
err = 0;
out_err:
- crypto_free_hash(hmac);
+ kzfree(desc);
+ crypto_free_shash(hmac);
dprintk("%s: returning %d\n", __func__, err);
return err;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_keys.c b/net/sunrpc/auth_gss/gss_krb5_keys.c
index 234fa8d0fd9b..870133146026 100644
--- a/net/sunrpc/auth_gss/gss_krb5_keys.c
+++ b/net/sunrpc/auth_gss/gss_krb5_keys.c
@@ -54,9 +54,9 @@
* WARRANTIES OF MERCHANTIBILITY AND FITNESS FOR A PARTICULAR PURPOSE.
*/
+#include <crypto/skcipher.h>
#include <linux/err.h>
#include <linux/types.h>
-#include <linux/crypto.h>
#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/xdr.h>
#include <linux/lcm.h>
@@ -147,7 +147,7 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
size_t blocksize, keybytes, keylength, n;
unsigned char *inblockdata, *outblockdata, *rawkey;
struct xdr_netobj inblock, outblock;
- struct crypto_blkcipher *cipher;
+ struct crypto_skcipher *cipher;
u32 ret = EINVAL;
blocksize = gk5e->blocksize;
@@ -157,11 +157,11 @@ u32 krb5_derive_key(const struct gss_krb5_enctype *gk5e,
if ((inkey->len != keylength) || (outkey->len != keylength))
goto err_return;
- cipher = crypto_alloc_blkcipher(gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_skcipher(gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(cipher))
goto err_return;
- if (crypto_blkcipher_setkey(cipher, inkey->data, inkey->len))
+ if (crypto_skcipher_setkey(cipher, inkey->data, inkey->len))
goto err_return;
/* allocate and set up buffers */
@@ -238,7 +238,7 @@ err_free_in:
memset(inblockdata, 0, blocksize);
kfree(inblockdata);
err_free_cipher:
- crypto_free_blkcipher(cipher);
+ crypto_free_skcipher(cipher);
err_return:
return ret;
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_mech.c b/net/sunrpc/auth_gss/gss_krb5_mech.c
index 28db442a0034..65427492b1c9 100644
--- a/net/sunrpc/auth_gss/gss_krb5_mech.c
+++ b/net/sunrpc/auth_gss/gss_krb5_mech.c
@@ -34,6 +34,8 @@
*
*/
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <linux/err.h>
#include <linux/module.h>
#include <linux/init.h>
@@ -42,7 +44,6 @@
#include <linux/sunrpc/auth.h>
#include <linux/sunrpc/gss_krb5.h>
#include <linux/sunrpc/xdr.h>
-#include <linux/crypto.h>
#include <linux/sunrpc/gss_krb5_enctypes.h>
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
@@ -217,7 +218,7 @@ simple_get_netobj(const void *p, const void *end, struct xdr_netobj *res)
static inline const void *
get_key(const void *p, const void *end,
- struct krb5_ctx *ctx, struct crypto_blkcipher **res)
+ struct krb5_ctx *ctx, struct crypto_skcipher **res)
{
struct xdr_netobj key;
int alg;
@@ -245,7 +246,7 @@ get_key(const void *p, const void *end,
if (IS_ERR(p))
goto out_err;
- *res = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
+ *res = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
CRYPTO_ALG_ASYNC);
if (IS_ERR(*res)) {
printk(KERN_WARNING "gss_kerberos_mech: unable to initialize "
@@ -253,7 +254,7 @@ get_key(const void *p, const void *end,
*res = NULL;
goto out_err_free_key;
}
- if (crypto_blkcipher_setkey(*res, key.data, key.len)) {
+ if (crypto_skcipher_setkey(*res, key.data, key.len)) {
printk(KERN_WARNING "gss_kerberos_mech: error setting key for "
"crypto algorithm %s\n", ctx->gk5e->encrypt_name);
goto out_err_free_tfm;
@@ -263,7 +264,7 @@ get_key(const void *p, const void *end,
return p;
out_err_free_tfm:
- crypto_free_blkcipher(*res);
+ crypto_free_skcipher(*res);
out_err_free_key:
kfree(key.data);
p = ERR_PTR(-EINVAL);
@@ -335,30 +336,30 @@ gss_import_v1_context(const void *p, const void *end, struct krb5_ctx *ctx)
return 0;
out_err_free_key2:
- crypto_free_blkcipher(ctx->seq);
+ crypto_free_skcipher(ctx->seq);
out_err_free_key1:
- crypto_free_blkcipher(ctx->enc);
+ crypto_free_skcipher(ctx->enc);
out_err_free_mech:
kfree(ctx->mech_used.data);
out_err:
return PTR_ERR(p);
}
-static struct crypto_blkcipher *
+static struct crypto_skcipher *
context_v2_alloc_cipher(struct krb5_ctx *ctx, const char *cname, u8 *key)
{
- struct crypto_blkcipher *cp;
+ struct crypto_skcipher *cp;
- cp = crypto_alloc_blkcipher(cname, 0, CRYPTO_ALG_ASYNC);
+ cp = crypto_alloc_skcipher(cname, 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(cp)) {
dprintk("gss_kerberos_mech: unable to initialize "
"crypto algorithm %s\n", cname);
return NULL;
}
- if (crypto_blkcipher_setkey(cp, key, ctx->gk5e->keylength)) {
+ if (crypto_skcipher_setkey(cp, key, ctx->gk5e->keylength)) {
dprintk("gss_kerberos_mech: error setting key for "
"crypto algorithm %s\n", cname);
- crypto_free_blkcipher(cp);
+ crypto_free_skcipher(cp);
return NULL;
}
return cp;
@@ -412,9 +413,9 @@ context_derive_keys_des3(struct krb5_ctx *ctx, gfp_t gfp_mask)
return 0;
out_free_enc:
- crypto_free_blkcipher(ctx->enc);
+ crypto_free_skcipher(ctx->enc);
out_free_seq:
- crypto_free_blkcipher(ctx->seq);
+ crypto_free_skcipher(ctx->seq);
out_err:
return -EINVAL;
}
@@ -427,18 +428,17 @@ out_err:
static int
context_derive_keys_rc4(struct krb5_ctx *ctx)
{
- struct crypto_hash *hmac;
+ struct crypto_shash *hmac;
char sigkeyconstant[] = "signaturekey";
int slen = strlen(sigkeyconstant) + 1; /* include null terminator */
- struct hash_desc desc;
- struct scatterlist sg[1];
+ struct shash_desc *desc;
int err;
dprintk("RPC: %s: entered\n", __func__);
/*
* derive cksum (aka Ksign) key
*/
- hmac = crypto_alloc_hash(ctx->gk5e->cksum_name, 0, CRYPTO_ALG_ASYNC);
+ hmac = crypto_alloc_shash(ctx->gk5e->cksum_name, 0, 0);
if (IS_ERR(hmac)) {
dprintk("%s: error %ld allocating hash '%s'\n",
__func__, PTR_ERR(hmac), ctx->gk5e->cksum_name);
@@ -446,37 +446,41 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
goto out_err;
}
- err = crypto_hash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
+ err = crypto_shash_setkey(hmac, ctx->Ksess, ctx->gk5e->keylength);
if (err)
goto out_err_free_hmac;
- sg_init_table(sg, 1);
- sg_set_buf(sg, sigkeyconstant, slen);
- desc.tfm = hmac;
- desc.flags = 0;
-
- err = crypto_hash_init(&desc);
- if (err)
+ desc = kmalloc(sizeof(*desc) + crypto_shash_descsize(hmac),
+ GFP_KERNEL);
+ if (!desc) {
+ dprintk("%s: failed to allocate hash descriptor for '%s'\n",
+ __func__, ctx->gk5e->cksum_name);
+ err = -ENOMEM;
goto out_err_free_hmac;
+ }
+
+ desc->tfm = hmac;
+ desc->flags = 0;
- err = crypto_hash_digest(&desc, sg, slen, ctx->cksum);
+ err = crypto_shash_digest(desc, sigkeyconstant, slen, ctx->cksum);
+ kzfree(desc);
if (err)
goto out_err_free_hmac;
/*
- * allocate hash, and blkciphers for data and seqnum encryption
+ * allocate hash, and skciphers for data and seqnum encryption
*/
- ctx->enc = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ ctx->enc = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(ctx->enc)) {
err = PTR_ERR(ctx->enc);
goto out_err_free_hmac;
}
- ctx->seq = crypto_alloc_blkcipher(ctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ ctx->seq = crypto_alloc_skcipher(ctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(ctx->seq)) {
- crypto_free_blkcipher(ctx->enc);
+ crypto_free_skcipher(ctx->enc);
err = PTR_ERR(ctx->seq);
goto out_err_free_hmac;
}
@@ -486,7 +490,7 @@ context_derive_keys_rc4(struct krb5_ctx *ctx)
err = 0;
out_err_free_hmac:
- crypto_free_hash(hmac);
+ crypto_free_shash(hmac);
out_err:
dprintk("RPC: %s: returning %d\n", __func__, err);
return err;
@@ -588,7 +592,7 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
context_v2_alloc_cipher(ctx, "cbc(aes)",
ctx->acceptor_seal);
if (ctx->acceptor_enc_aux == NULL) {
- crypto_free_blkcipher(ctx->initiator_enc_aux);
+ crypto_free_skcipher(ctx->initiator_enc_aux);
goto out_free_acceptor_enc;
}
}
@@ -596,9 +600,9 @@ context_derive_keys_new(struct krb5_ctx *ctx, gfp_t gfp_mask)
return 0;
out_free_acceptor_enc:
- crypto_free_blkcipher(ctx->acceptor_enc);
+ crypto_free_skcipher(ctx->acceptor_enc);
out_free_initiator_enc:
- crypto_free_blkcipher(ctx->initiator_enc);
+ crypto_free_skcipher(ctx->initiator_enc);
out_err:
return -EINVAL;
}
@@ -710,12 +714,12 @@ static void
gss_delete_sec_context_kerberos(void *internal_ctx) {
struct krb5_ctx *kctx = internal_ctx;
- crypto_free_blkcipher(kctx->seq);
- crypto_free_blkcipher(kctx->enc);
- crypto_free_blkcipher(kctx->acceptor_enc);
- crypto_free_blkcipher(kctx->initiator_enc);
- crypto_free_blkcipher(kctx->acceptor_enc_aux);
- crypto_free_blkcipher(kctx->initiator_enc_aux);
+ crypto_free_skcipher(kctx->seq);
+ crypto_free_skcipher(kctx->enc);
+ crypto_free_skcipher(kctx->acceptor_enc);
+ crypto_free_skcipher(kctx->initiator_enc);
+ crypto_free_skcipher(kctx->acceptor_enc_aux);
+ crypto_free_skcipher(kctx->initiator_enc_aux);
kfree(kctx->mech_used.data);
kfree(kctx);
}
diff --git a/net/sunrpc/auth_gss/gss_krb5_seqnum.c b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
index 20d55c793eb6..c8b9082f4a9d 100644
--- a/net/sunrpc/auth_gss/gss_krb5_seqnum.c
+++ b/net/sunrpc/auth_gss/gss_krb5_seqnum.c
@@ -31,9 +31,9 @@
* PERFORMANCE OF THIS SOFTWARE.
*/
+#include <crypto/skcipher.h>
#include <linux/types.h>
#include <linux/sunrpc/gss_krb5.h>
-#include <linux/crypto.h>
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -43,13 +43,13 @@ static s32
krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
unsigned char *cksum, unsigned char *buf)
{
- struct crypto_blkcipher *cipher;
+ struct crypto_skcipher *cipher;
unsigned char plain[8];
s32 code;
dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(cipher))
return PTR_ERR(cipher);
@@ -68,12 +68,12 @@ krb5_make_rc4_seq_num(struct krb5_ctx *kctx, int direction, s32 seqnum,
code = krb5_encrypt(cipher, cksum, plain, buf, 8);
out:
- crypto_free_blkcipher(cipher);
+ crypto_free_skcipher(cipher);
return code;
}
s32
krb5_make_seq_num(struct krb5_ctx *kctx,
- struct crypto_blkcipher *key,
+ struct crypto_skcipher *key,
int direction,
u32 seqnum,
unsigned char *cksum, unsigned char *buf)
@@ -101,13 +101,13 @@ static s32
krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
unsigned char *buf, int *direction, s32 *seqnum)
{
- struct crypto_blkcipher *cipher;
+ struct crypto_skcipher *cipher;
unsigned char plain[8];
s32 code;
dprintk("RPC: %s:\n", __func__);
- cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(cipher))
return PTR_ERR(cipher);
@@ -130,7 +130,7 @@ krb5_get_rc4_seq_num(struct krb5_ctx *kctx, unsigned char *cksum,
*seqnum = ((plain[0] << 24) | (plain[1] << 16) |
(plain[2] << 8) | (plain[3]));
out:
- crypto_free_blkcipher(cipher);
+ crypto_free_skcipher(cipher);
return code;
}
@@ -142,7 +142,7 @@ krb5_get_seq_num(struct krb5_ctx *kctx,
{
s32 code;
unsigned char plain[8];
- struct crypto_blkcipher *key = kctx->seq;
+ struct crypto_skcipher *key = kctx->seq;
dprintk("RPC: krb5_get_seq_num:\n");
diff --git a/net/sunrpc/auth_gss/gss_krb5_wrap.c b/net/sunrpc/auth_gss/gss_krb5_wrap.c
index ca7e92a32f84..a737c2da0837 100644
--- a/net/sunrpc/auth_gss/gss_krb5_wrap.c
+++ b/net/sunrpc/auth_gss/gss_krb5_wrap.c
@@ -28,12 +28,12 @@
* SUCH DAMAGES.
*/
+#include <crypto/skcipher.h>
#include <linux/types.h>
#include <linux/jiffies.h>
#include <linux/sunrpc/gss_krb5.h>
#include <linux/random.h>
#include <linux/pagemap.h>
-#include <linux/crypto.h>
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
@@ -79,9 +79,9 @@ gss_krb5_remove_padding(struct xdr_buf *buf, int blocksize)
len -= buf->head[0].iov_len;
if (len <= buf->page_len) {
unsigned int last = (buf->page_base + len - 1)
- >>PAGE_CACHE_SHIFT;
+ >>PAGE_SHIFT;
unsigned int offset = (buf->page_base + len - 1)
- & (PAGE_CACHE_SIZE - 1);
+ & (PAGE_SIZE - 1);
ptr = kmap_atomic(buf->pages[last]);
pad = *(ptr + offset);
kunmap_atomic(ptr);
@@ -174,7 +174,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
now = get_seconds();
- blocksize = crypto_blkcipher_blocksize(kctx->enc);
+ blocksize = crypto_skcipher_blocksize(kctx->enc);
gss_krb5_add_padding(buf, offset, blocksize);
BUG_ON((buf->len - offset) % blocksize);
plainlen = conflen + buf->len - offset;
@@ -239,10 +239,10 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
return GSS_S_FAILURE;
if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_blkcipher *cipher;
+ struct crypto_skcipher *cipher;
int err;
- cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(cipher))
return GSS_S_FAILURE;
@@ -250,7 +250,7 @@ gss_wrap_kerberos_v1(struct krb5_ctx *kctx, int offset,
err = gss_encrypt_xdr_buf(cipher, buf,
offset + headlen - conflen, pages);
- crypto_free_blkcipher(cipher);
+ crypto_free_skcipher(cipher);
if (err)
return GSS_S_FAILURE;
} else {
@@ -327,18 +327,18 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
return GSS_S_BAD_SIG;
if (kctx->enctype == ENCTYPE_ARCFOUR_HMAC) {
- struct crypto_blkcipher *cipher;
+ struct crypto_skcipher *cipher;
int err;
- cipher = crypto_alloc_blkcipher(kctx->gk5e->encrypt_name, 0,
- CRYPTO_ALG_ASYNC);
+ cipher = crypto_alloc_skcipher(kctx->gk5e->encrypt_name, 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(cipher))
return GSS_S_FAILURE;
krb5_rc4_setup_enc_key(kctx, cipher, seqnum);
err = gss_decrypt_xdr_buf(cipher, buf, crypt_offset);
- crypto_free_blkcipher(cipher);
+ crypto_free_skcipher(cipher);
if (err)
return GSS_S_DEFECTIVE_TOKEN;
} else {
@@ -371,7 +371,7 @@ gss_unwrap_kerberos_v1(struct krb5_ctx *kctx, int offset, struct xdr_buf *buf)
/* Copy the data back to the right position. XXX: Would probably be
* better to copy and encrypt at the same time. */
- blocksize = crypto_blkcipher_blocksize(kctx->enc);
+ blocksize = crypto_skcipher_blocksize(kctx->enc);
data_start = ptr + (GSS_KRB5_TOK_HDR_LEN + kctx->gk5e->cksumlength) +
conflen;
orig_start = buf->head[0].iov_base + offset;
@@ -473,7 +473,7 @@ gss_wrap_kerberos_v2(struct krb5_ctx *kctx, u32 offset,
*ptr++ = 0xff;
be16ptr = (__be16 *)ptr;
- blocksize = crypto_blkcipher_blocksize(kctx->acceptor_enc);
+ blocksize = crypto_skcipher_blocksize(kctx->acceptor_enc);
*be16ptr++ = 0;
/* "inner" token header always uses 0 for RRC */
*be16ptr++ = 0;
diff --git a/net/sunrpc/auth_gss/gss_rpc_upcall.c b/net/sunrpc/auth_gss/gss_rpc_upcall.c
index 59eeed43eda2..f0c6a8c78a56 100644
--- a/net/sunrpc/auth_gss/gss_rpc_upcall.c
+++ b/net/sunrpc/auth_gss/gss_rpc_upcall.c
@@ -326,6 +326,9 @@ int gssp_accept_sec_context_upcall(struct net *net,
if (data->found_creds && client_name.data != NULL) {
char *c;
+ data->creds.cr_raw_principal = kstrndup(client_name.data,
+ client_name.len, GFP_KERNEL);
+
data->creds.cr_principal = kstrndup(client_name.data,
client_name.len, GFP_KERNEL);
if (data->creds.cr_principal) {
diff --git a/net/sunrpc/auth_null.c b/net/sunrpc/auth_null.c
index c2a2b584a056..8d9eb4d5ddd8 100644
--- a/net/sunrpc/auth_null.c
+++ b/net/sunrpc/auth_null.c
@@ -113,8 +113,8 @@ const struct rpc_authops authnull_ops = {
static
struct rpc_auth null_auth = {
- .au_cslack = 4,
- .au_rslack = 2,
+ .au_cslack = NUL_CALLSLACK,
+ .au_rslack = NUL_REPLYSLACK,
.au_ops = &authnull_ops,
.au_flavor = RPC_AUTH_NULL,
.au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/auth_unix.c b/net/sunrpc/auth_unix.c
index 548240dd15fc..0d3dd364c22f 100644
--- a/net/sunrpc/auth_unix.c
+++ b/net/sunrpc/auth_unix.c
@@ -23,8 +23,6 @@ struct unx_cred {
};
#define uc_uid uc_base.cr_uid
-#define UNX_WRITESLACK (21 + XDR_QUADLEN(UNX_MAXNODENAME))
-
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
# define RPCDBG_FACILITY RPCDBG_AUTH
#endif
@@ -228,8 +226,8 @@ const struct rpc_authops authunix_ops = {
static
struct rpc_auth unix_auth = {
- .au_cslack = UNX_WRITESLACK,
- .au_rslack = 2, /* assume AUTH_NULL verf */
+ .au_cslack = UNX_CALLSLACK,
+ .au_rslack = NUL_REPLYSLACK,
.au_ops = &authunix_ops,
.au_flavor = RPC_AUTH_UNIX,
.au_count = ATOMIC_INIT(0),
diff --git a/net/sunrpc/backchannel_rqst.c b/net/sunrpc/backchannel_rqst.c
index 6255d141133b..229956bf8457 100644
--- a/net/sunrpc/backchannel_rqst.c
+++ b/net/sunrpc/backchannel_rqst.c
@@ -138,6 +138,14 @@ out_free:
*/
int xprt_setup_backchannel(struct rpc_xprt *xprt, unsigned int min_reqs)
{
+ if (!xprt->ops->bc_setup)
+ return 0;
+ return xprt->ops->bc_setup(xprt, min_reqs);
+}
+EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
+
+int xprt_setup_bc(struct rpc_xprt *xprt, unsigned int min_reqs)
+{
struct rpc_rqst *req;
struct list_head tmp_list;
int i;
@@ -192,7 +200,6 @@ out_free:
dprintk("RPC: setup backchannel transport failed\n");
return -ENOMEM;
}
-EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
/**
* xprt_destroy_backchannel - Destroys the backchannel preallocated structures.
@@ -205,6 +212,13 @@ EXPORT_SYMBOL_GPL(xprt_setup_backchannel);
*/
void xprt_destroy_backchannel(struct rpc_xprt *xprt, unsigned int max_reqs)
{
+ if (xprt->ops->bc_destroy)
+ xprt->ops->bc_destroy(xprt, max_reqs);
+}
+EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
+
+void xprt_destroy_bc(struct rpc_xprt *xprt, unsigned int max_reqs)
+{
struct rpc_rqst *req = NULL, *tmp = NULL;
dprintk("RPC: destroy backchannel transport\n");
@@ -227,7 +241,6 @@ out:
dprintk("RPC: backchannel list empty= %s\n",
list_empty(&xprt->bc_pa_list) ? "true" : "false");
}
-EXPORT_SYMBOL_GPL(xprt_destroy_backchannel);
static struct rpc_rqst *xprt_alloc_bc_request(struct rpc_xprt *xprt, __be32 xid)
{
@@ -264,6 +277,13 @@ void xprt_free_bc_request(struct rpc_rqst *req)
{
struct rpc_xprt *xprt = req->rq_xprt;
+ xprt->ops->bc_free_rqst(req);
+}
+
+void xprt_free_bc_rqst(struct rpc_rqst *req)
+{
+ struct rpc_xprt *xprt = req->rq_xprt;
+
dprintk("RPC: free backchannel req=%p\n", req);
req->rq_connect_cookie = xprt->connect_cookie - 1;
diff --git a/net/sunrpc/cache.c b/net/sunrpc/cache.c
index 4a2340a54401..553bf95f7003 100644
--- a/net/sunrpc/cache.c
+++ b/net/sunrpc/cache.c
@@ -41,13 +41,16 @@
static bool cache_defer_req(struct cache_req *req, struct cache_head *item);
static void cache_revisit_request(struct cache_head *item);
-static void cache_init(struct cache_head *h)
+static void cache_init(struct cache_head *h, struct cache_detail *detail)
{
time_t now = seconds_since_boot();
INIT_HLIST_NODE(&h->cache_list);
h->flags = 0;
kref_init(&h->ref);
h->expiry_time = now + CACHE_NEW_EXPIRY;
+ if (now <= detail->flush_time)
+ /* ensure it isn't already expired */
+ now = detail->flush_time + 1;
h->last_refresh = now;
}
@@ -81,7 +84,7 @@ struct cache_head *sunrpc_cache_lookup(struct cache_detail *detail,
* we might get lose if we need to
* cache_put it soon.
*/
- cache_init(new);
+ cache_init(new, detail);
detail->init(new, key);
write_lock(&detail->hash_lock);
@@ -116,10 +119,15 @@ EXPORT_SYMBOL_GPL(sunrpc_cache_lookup);
static void cache_dequeue(struct cache_detail *detail, struct cache_head *ch);
-static void cache_fresh_locked(struct cache_head *head, time_t expiry)
+static void cache_fresh_locked(struct cache_head *head, time_t expiry,
+ struct cache_detail *detail)
{
+ time_t now = seconds_since_boot();
+ if (now <= detail->flush_time)
+ /* ensure it isn't immediately treated as expired */
+ now = detail->flush_time + 1;
head->expiry_time = expiry;
- head->last_refresh = seconds_since_boot();
+ head->last_refresh = now;
smp_wmb(); /* paired with smp_rmb() in cache_is_valid() */
set_bit(CACHE_VALID, &head->flags);
}
@@ -149,7 +157,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
set_bit(CACHE_NEGATIVE, &old->flags);
else
detail->update(old, new);
- cache_fresh_locked(old, new->expiry_time);
+ cache_fresh_locked(old, new->expiry_time, detail);
write_unlock(&detail->hash_lock);
cache_fresh_unlocked(old, detail);
return old;
@@ -162,7 +170,7 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
cache_put(old, detail);
return NULL;
}
- cache_init(tmp);
+ cache_init(tmp, detail);
detail->init(tmp, old);
write_lock(&detail->hash_lock);
@@ -173,8 +181,8 @@ struct cache_head *sunrpc_cache_update(struct cache_detail *detail,
hlist_add_head(&tmp->cache_list, &detail->hash_table[hash]);
detail->entries++;
cache_get(tmp);
- cache_fresh_locked(tmp, new->expiry_time);
- cache_fresh_locked(old, 0);
+ cache_fresh_locked(tmp, new->expiry_time, detail);
+ cache_fresh_locked(old, 0, detail);
write_unlock(&detail->hash_lock);
cache_fresh_unlocked(tmp, detail);
cache_fresh_unlocked(old, detail);
@@ -219,7 +227,8 @@ static int try_to_negate_entry(struct cache_detail *detail, struct cache_head *h
rv = cache_is_valid(h);
if (rv == -EAGAIN) {
set_bit(CACHE_NEGATIVE, &h->flags);
- cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY);
+ cache_fresh_locked(h, seconds_since_boot()+CACHE_NEW_EXPIRY,
+ detail);
rv = -ENOENT;
}
write_unlock(&detail->hash_lock);
@@ -487,10 +496,13 @@ EXPORT_SYMBOL_GPL(cache_flush);
void cache_purge(struct cache_detail *detail)
{
- detail->flush_time = LONG_MAX;
+ time_t now = seconds_since_boot();
+ if (detail->flush_time >= now)
+ now = detail->flush_time + 1;
+ /* 'now' is the maximum value any 'last_refresh' can have */
+ detail->flush_time = now;
detail->nextcheck = seconds_since_boot();
cache_flush();
- detail->flush_time = 1;
}
EXPORT_SYMBOL_GPL(cache_purge);
@@ -759,7 +771,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
if (count == 0)
return 0;
- mutex_lock(&inode->i_mutex); /* protect against multiple concurrent
+ inode_lock(inode); /* protect against multiple concurrent
* readers on this file */
again:
spin_lock(&queue_lock);
@@ -772,7 +784,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
}
if (rp->q.list.next == &cd->queue) {
spin_unlock(&queue_lock);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
WARN_ON_ONCE(rp->offset);
return 0;
}
@@ -826,7 +838,7 @@ static ssize_t cache_read(struct file *filp, char __user *buf, size_t count,
}
if (err == -EAGAIN)
goto again;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return err ? err : count;
}
@@ -869,7 +881,7 @@ static ssize_t cache_downcall(struct address_space *mapping,
char *kaddr;
ssize_t ret = -ENOMEM;
- if (count >= PAGE_CACHE_SIZE)
+ if (count >= PAGE_SIZE)
goto out_slow;
page = find_or_create_page(mapping, 0, GFP_KERNEL);
@@ -880,7 +892,7 @@ static ssize_t cache_downcall(struct address_space *mapping,
ret = cache_do_downcall(kaddr, buf, count, cd);
kunmap(page);
unlock_page(page);
- page_cache_release(page);
+ put_page(page);
return ret;
out_slow:
return cache_slow_downcall(buf, count, cd);
@@ -897,9 +909,9 @@ static ssize_t cache_write(struct file *filp, const char __user *buf,
if (!cd->cache_parse)
goto out;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
ret = cache_downcall(mapping, buf, count, cd);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
out:
return ret;
}
@@ -1170,14 +1182,14 @@ int sunrpc_cache_pipe_upcall(struct cache_detail *detail, struct cache_head *h)
}
crq->q.reader = 0;
- crq->item = cache_get(h);
crq->buf = buf;
crq->len = 0;
crq->readers = 0;
spin_lock(&queue_lock);
- if (test_bit(CACHE_PENDING, &h->flags))
+ if (test_bit(CACHE_PENDING, &h->flags)) {
+ crq->item = cache_get(h);
list_add_tail(&crq->q.list, &detail->queue);
- else
+ } else
/* Lost a race, no longer PENDING, so don't enqueue */
ret = -EAGAIN;
spin_unlock(&queue_lock);
@@ -1213,7 +1225,7 @@ int qword_get(char **bpp, char *dest, int bufsize)
if (bp[0] == '\\' && bp[1] == 'x') {
/* HEX STRING */
bp += 2;
- while (len < bufsize) {
+ while (len < bufsize - 1) {
int h, l;
h = hex_to_bin(bp[0]);
@@ -1436,6 +1448,7 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
{
char tbuf[20];
char *bp, *ep;
+ time_t then, now;
if (*ppos || count > sizeof(tbuf)-1)
return -EINVAL;
@@ -1447,8 +1460,22 @@ static ssize_t write_flush(struct file *file, const char __user *buf,
return -EINVAL;
bp = tbuf;
- cd->flush_time = get_expiry(&bp);
- cd->nextcheck = seconds_since_boot();
+ then = get_expiry(&bp);
+ now = seconds_since_boot();
+ cd->nextcheck = now;
+ /* Can only set flush_time to 1 second beyond "now", or
+ * possibly 1 second beyond flushtime. This is because
+ * flush_time never goes backwards so it mustn't get too far
+ * ahead of time.
+ */
+ if (then >= now) {
+ /* Want to flush everything, so behave like cache_purge() */
+ if (cd->flush_time >= now)
+ now = cd->flush_time + 1;
+ then = now;
+ }
+
+ cd->flush_time = then;
cache_flush();
*ppos += count;
diff --git a/net/sunrpc/clnt.c b/net/sunrpc/clnt.c
index 23608eb0ded2..7e0c9bf22df8 100644
--- a/net/sunrpc/clnt.c
+++ b/net/sunrpc/clnt.c
@@ -354,6 +354,7 @@ static void rpc_free_clid(struct rpc_clnt *clnt)
}
static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
+ struct rpc_xprt_switch *xps,
struct rpc_xprt *xprt,
struct rpc_clnt *parent)
{
@@ -411,6 +412,8 @@ static struct rpc_clnt * rpc_new_client(const struct rpc_create_args *args,
}
rpc_clnt_set_transport(clnt, xprt, timeout);
+ xprt_iter_init(&clnt->cl_xpi, xps);
+ xprt_switch_put(xps);
clnt->cl_rtt = &clnt->cl_rtt_default;
rpc_init_rtt(&clnt->cl_rtt_default, clnt->cl_timeout->to_initval);
@@ -438,6 +441,7 @@ out_no_clid:
out_err:
rpciod_down();
out_no_rpciod:
+ xprt_switch_put(xps);
xprt_put(xprt);
return ERR_PTR(err);
}
@@ -446,8 +450,13 @@ struct rpc_clnt *rpc_create_xprt(struct rpc_create_args *args,
struct rpc_xprt *xprt)
{
struct rpc_clnt *clnt = NULL;
+ struct rpc_xprt_switch *xps;
- clnt = rpc_new_client(args, xprt, NULL);
+ xps = xprt_switch_alloc(xprt, GFP_KERNEL);
+ if (xps == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ clnt = rpc_new_client(args, xps, xprt, NULL);
if (IS_ERR(clnt))
return clnt;
@@ -564,6 +573,7 @@ EXPORT_SYMBOL_GPL(rpc_create);
static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
struct rpc_clnt *clnt)
{
+ struct rpc_xprt_switch *xps;
struct rpc_xprt *xprt;
struct rpc_clnt *new;
int err;
@@ -571,13 +581,17 @@ static struct rpc_clnt *__rpc_clone_client(struct rpc_create_args *args,
err = -ENOMEM;
rcu_read_lock();
xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
+ xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
rcu_read_unlock();
- if (xprt == NULL)
+ if (xprt == NULL || xps == NULL) {
+ xprt_put(xprt);
+ xprt_switch_put(xps);
goto out_err;
+ }
args->servername = xprt->servername;
args->nodename = clnt->cl_nodename;
- new = rpc_new_client(args, xprt, clnt);
+ new = rpc_new_client(args, xps, xprt, clnt);
if (IS_ERR(new)) {
err = PTR_ERR(new);
goto out_err;
@@ -657,6 +671,7 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt,
{
const struct rpc_timeout *old_timeo;
rpc_authflavor_t pseudoflavor;
+ struct rpc_xprt_switch *xps, *oldxps;
struct rpc_xprt *xprt, *old;
struct rpc_clnt *parent;
int err;
@@ -668,10 +683,17 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt,
return PTR_ERR(xprt);
}
+ xps = xprt_switch_alloc(xprt, GFP_KERNEL);
+ if (xps == NULL) {
+ xprt_put(xprt);
+ return -ENOMEM;
+ }
+
pseudoflavor = clnt->cl_auth->au_flavor;
old_timeo = clnt->cl_timeout;
old = rpc_clnt_set_transport(clnt, xprt, timeout);
+ oldxps = xprt_iter_xchg_switch(&clnt->cl_xpi, xps);
rpc_unregister_client(clnt);
__rpc_clnt_remove_pipedir(clnt);
@@ -697,20 +719,74 @@ int rpc_switch_client_transport(struct rpc_clnt *clnt,
synchronize_rcu();
if (parent != clnt)
rpc_release_client(parent);
+ xprt_switch_put(oldxps);
xprt_put(old);
dprintk("RPC: replaced xprt for clnt %p\n", clnt);
return 0;
out_revert:
+ xps = xprt_iter_xchg_switch(&clnt->cl_xpi, oldxps);
rpc_clnt_set_transport(clnt, old, old_timeo);
clnt->cl_parent = parent;
rpc_client_register(clnt, pseudoflavor, NULL);
+ xprt_switch_put(xps);
xprt_put(xprt);
dprintk("RPC: failed to switch xprt for clnt %p\n", clnt);
return err;
}
EXPORT_SYMBOL_GPL(rpc_switch_client_transport);
+static
+int rpc_clnt_xprt_iter_init(struct rpc_clnt *clnt, struct rpc_xprt_iter *xpi)
+{
+ struct rpc_xprt_switch *xps;
+
+ rcu_read_lock();
+ xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+ rcu_read_unlock();
+ if (xps == NULL)
+ return -EAGAIN;
+ xprt_iter_init_listall(xpi, xps);
+ xprt_switch_put(xps);
+ return 0;
+}
+
+/**
+ * rpc_clnt_iterate_for_each_xprt - Apply a function to all transports
+ * @clnt: pointer to client
+ * @fn: function to apply
+ * @data: void pointer to function data
+ *
+ * Iterates through the list of RPC transports currently attached to the
+ * client and applies the function fn(clnt, xprt, data).
+ *
+ * On error, the iteration stops, and the function returns the error value.
+ */
+int rpc_clnt_iterate_for_each_xprt(struct rpc_clnt *clnt,
+ int (*fn)(struct rpc_clnt *, struct rpc_xprt *, void *),
+ void *data)
+{
+ struct rpc_xprt_iter xpi;
+ int ret;
+
+ ret = rpc_clnt_xprt_iter_init(clnt, &xpi);
+ if (ret)
+ return ret;
+ for (;;) {
+ struct rpc_xprt *xprt = xprt_iter_get_next(&xpi);
+
+ if (!xprt)
+ break;
+ ret = fn(clnt, xprt, data);
+ xprt_put(xprt);
+ if (ret < 0)
+ break;
+ }
+ xprt_iter_destroy(&xpi);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_iterate_for_each_xprt);
+
/*
* Kill all tasks for the given client.
* XXX: kill their descendants as well?
@@ -783,6 +859,7 @@ rpc_free_client(struct rpc_clnt *clnt)
rpc_free_iostats(clnt->cl_metrics);
clnt->cl_metrics = NULL;
xprt_put(rcu_dereference_raw(clnt->cl_xprt));
+ xprt_iter_destroy(&clnt->cl_xpi);
rpciod_down();
rpc_free_clid(clnt);
kfree(clnt);
@@ -868,6 +945,7 @@ EXPORT_SYMBOL_GPL(rpc_bind_new_program);
void rpc_task_release_client(struct rpc_task *task)
{
struct rpc_clnt *clnt = task->tk_client;
+ struct rpc_xprt *xprt = task->tk_xprt;
if (clnt != NULL) {
/* Remove from client task list */
@@ -878,13 +956,22 @@ void rpc_task_release_client(struct rpc_task *task)
rpc_release_client(clnt);
}
+
+ if (xprt != NULL) {
+ task->tk_xprt = NULL;
+
+ xprt_put(xprt);
+ }
}
static
void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
{
+
if (clnt != NULL) {
rpc_task_release_client(task);
+ if (task->tk_xprt == NULL)
+ task->tk_xprt = xprt_iter_get_next(&clnt->cl_xpi);
task->tk_client = clnt;
atomic_inc(&clnt->cl_count);
if (clnt->cl_softrtry)
@@ -900,14 +987,6 @@ void rpc_task_set_client(struct rpc_task *task, struct rpc_clnt *clnt)
}
}
-void rpc_task_reset_client(struct rpc_task *task, struct rpc_clnt *clnt)
-{
- rpc_task_release_client(task);
- rpc_task_set_client(task, clnt);
-}
-EXPORT_SYMBOL_GPL(rpc_task_reset_client);
-
-
static void
rpc_task_set_rpc_message(struct rpc_task *task, const struct rpc_message *msg)
{
@@ -1217,6 +1296,7 @@ static int rpc_anyaddr(int family, struct sockaddr *buf, size_t buflen)
return -EINVAL;
memcpy(buf, &rpc_in6addr_loopback,
sizeof(rpc_in6addr_loopback));
+ break;
default:
dprintk("RPC: %s: address family not supported\n",
__func__);
@@ -2103,11 +2183,9 @@ call_timeout(struct rpc_task *task)
}
if (RPC_IS_SOFT(task)) {
if (clnt->cl_chatty) {
- rcu_read_lock();
printk(KERN_NOTICE "%s: server %s not responding, timed out\n",
clnt->cl_program->name,
- rcu_dereference(clnt->cl_xprt)->servername);
- rcu_read_unlock();
+ task->tk_xprt->servername);
}
if (task->tk_flags & RPC_TASK_TIMEOUT)
rpc_exit(task, -ETIMEDOUT);
@@ -2119,11 +2197,9 @@ call_timeout(struct rpc_task *task)
if (!(task->tk_flags & RPC_CALL_MAJORSEEN)) {
task->tk_flags |= RPC_CALL_MAJORSEEN;
if (clnt->cl_chatty) {
- rcu_read_lock();
printk(KERN_NOTICE "%s: server %s not responding, still trying\n",
clnt->cl_program->name,
- rcu_dereference(clnt->cl_xprt)->servername);
- rcu_read_unlock();
+ task->tk_xprt->servername);
}
}
rpc_force_rebind(clnt);
@@ -2153,11 +2229,9 @@ call_decode(struct rpc_task *task)
if (task->tk_flags & RPC_CALL_MAJORSEEN) {
if (clnt->cl_chatty) {
- rcu_read_lock();
printk(KERN_NOTICE "%s: server %s OK\n",
clnt->cl_program->name,
- rcu_dereference(clnt->cl_xprt)->servername);
- rcu_read_unlock();
+ task->tk_xprt->servername);
}
task->tk_flags &= ~RPC_CALL_MAJORSEEN;
}
@@ -2311,11 +2385,9 @@ rpc_verify_header(struct rpc_task *task)
task->tk_action = call_bind;
goto out_retry;
case RPC_AUTH_TOOWEAK:
- rcu_read_lock();
printk(KERN_NOTICE "RPC: server %s requires stronger "
"authentication.\n",
- rcu_dereference(clnt->cl_xprt)->servername);
- rcu_read_unlock();
+ task->tk_xprt->servername);
break;
default:
dprintk("RPC: %5u %s: unknown auth error: %x\n",
@@ -2340,27 +2412,27 @@ rpc_verify_header(struct rpc_task *task)
case RPC_SUCCESS:
return p;
case RPC_PROG_UNAVAIL:
- dprintk_rcu("RPC: %5u %s: program %u is unsupported "
+ dprintk("RPC: %5u %s: program %u is unsupported "
"by server %s\n", task->tk_pid, __func__,
(unsigned int)clnt->cl_prog,
- rcu_dereference(clnt->cl_xprt)->servername);
+ task->tk_xprt->servername);
error = -EPFNOSUPPORT;
goto out_err;
case RPC_PROG_MISMATCH:
- dprintk_rcu("RPC: %5u %s: program %u, version %u unsupported "
+ dprintk("RPC: %5u %s: program %u, version %u unsupported "
"by server %s\n", task->tk_pid, __func__,
(unsigned int)clnt->cl_prog,
(unsigned int)clnt->cl_vers,
- rcu_dereference(clnt->cl_xprt)->servername);
+ task->tk_xprt->servername);
error = -EPROTONOSUPPORT;
goto out_err;
case RPC_PROC_UNAVAIL:
- dprintk_rcu("RPC: %5u %s: proc %s unsupported by program %u, "
+ dprintk("RPC: %5u %s: proc %s unsupported by program %u, "
"version %u on server %s\n",
task->tk_pid, __func__,
rpc_proc_name(task),
clnt->cl_prog, clnt->cl_vers,
- rcu_dereference(clnt->cl_xprt)->servername);
+ task->tk_xprt->servername);
error = -EOPNOTSUPP;
goto out_err;
case RPC_GARBAGE_ARGS:
@@ -2420,7 +2492,10 @@ static int rpc_ping(struct rpc_clnt *clnt)
return err;
}
-struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags)
+static
+struct rpc_task *rpc_call_null_helper(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt, struct rpc_cred *cred, int flags,
+ const struct rpc_call_ops *ops, void *data)
{
struct rpc_message msg = {
.rpc_proc = &rpcproc_null,
@@ -2428,14 +2503,140 @@ struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int
};
struct rpc_task_setup task_setup_data = {
.rpc_client = clnt,
+ .rpc_xprt = xprt,
.rpc_message = &msg,
- .callback_ops = &rpc_default_ops,
+ .callback_ops = (ops != NULL) ? ops : &rpc_default_ops,
+ .callback_data = data,
.flags = flags,
};
+
return rpc_run_task(&task_setup_data);
}
+
+struct rpc_task *rpc_call_null(struct rpc_clnt *clnt, struct rpc_cred *cred, int flags)
+{
+ return rpc_call_null_helper(clnt, NULL, cred, flags, NULL, NULL);
+}
EXPORT_SYMBOL_GPL(rpc_call_null);
+struct rpc_cb_add_xprt_calldata {
+ struct rpc_xprt_switch *xps;
+ struct rpc_xprt *xprt;
+};
+
+static void rpc_cb_add_xprt_done(struct rpc_task *task, void *calldata)
+{
+ struct rpc_cb_add_xprt_calldata *data = calldata;
+
+ if (task->tk_status == 0)
+ rpc_xprt_switch_add_xprt(data->xps, data->xprt);
+}
+
+static void rpc_cb_add_xprt_release(void *calldata)
+{
+ struct rpc_cb_add_xprt_calldata *data = calldata;
+
+ xprt_put(data->xprt);
+ xprt_switch_put(data->xps);
+ kfree(data);
+}
+
+const static struct rpc_call_ops rpc_cb_add_xprt_call_ops = {
+ .rpc_call_done = rpc_cb_add_xprt_done,
+ .rpc_release = rpc_cb_add_xprt_release,
+};
+
+/**
+ * rpc_clnt_test_and_add_xprt - Test and add a new transport to a rpc_clnt
+ * @clnt: pointer to struct rpc_clnt
+ * @xps: pointer to struct rpc_xprt_switch,
+ * @xprt: pointer struct rpc_xprt
+ * @dummy: unused
+ */
+int rpc_clnt_test_and_add_xprt(struct rpc_clnt *clnt,
+ struct rpc_xprt_switch *xps, struct rpc_xprt *xprt,
+ void *dummy)
+{
+ struct rpc_cb_add_xprt_calldata *data;
+ struct rpc_cred *cred;
+ struct rpc_task *task;
+
+ data = kmalloc(sizeof(*data), GFP_NOFS);
+ if (!data)
+ return -ENOMEM;
+ data->xps = xprt_switch_get(xps);
+ data->xprt = xprt_get(xprt);
+
+ cred = authnull_ops.lookup_cred(NULL, NULL, 0);
+ task = rpc_call_null_helper(clnt, xprt, cred,
+ RPC_TASK_SOFT|RPC_TASK_SOFTCONN|RPC_TASK_ASYNC,
+ &rpc_cb_add_xprt_call_ops, data);
+ put_rpccred(cred);
+ if (IS_ERR(task))
+ return PTR_ERR(task);
+ rpc_put_task(task);
+ return 1;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_test_and_add_xprt);
+
+/**
+ * rpc_clnt_add_xprt - Add a new transport to a rpc_clnt
+ * @clnt: pointer to struct rpc_clnt
+ * @xprtargs: pointer to struct xprt_create
+ * @setup: callback to test and/or set up the connection
+ * @data: pointer to setup function data
+ *
+ * Creates a new transport using the parameters set in args and
+ * adds it to clnt.
+ * If ping is set, then test that connectivity succeeds before
+ * adding the new transport.
+ *
+ */
+int rpc_clnt_add_xprt(struct rpc_clnt *clnt,
+ struct xprt_create *xprtargs,
+ int (*setup)(struct rpc_clnt *,
+ struct rpc_xprt_switch *,
+ struct rpc_xprt *,
+ void *),
+ void *data)
+{
+ struct rpc_xprt_switch *xps;
+ struct rpc_xprt *xprt;
+ unsigned char resvport;
+ int ret = 0;
+
+ rcu_read_lock();
+ xps = xprt_switch_get(rcu_dereference(clnt->cl_xpi.xpi_xpswitch));
+ xprt = xprt_iter_xprt(&clnt->cl_xpi);
+ if (xps == NULL || xprt == NULL) {
+ rcu_read_unlock();
+ return -EAGAIN;
+ }
+ resvport = xprt->resvport;
+ rcu_read_unlock();
+
+ xprt = xprt_create_transport(xprtargs);
+ if (IS_ERR(xprt)) {
+ ret = PTR_ERR(xprt);
+ goto out_put_switch;
+ }
+ xprt->resvport = resvport;
+
+ rpc_xprt_switch_set_roundrobin(xps);
+ if (setup) {
+ ret = setup(clnt, xps, xprt, data);
+ if (ret != 0)
+ goto out_put_xprt;
+ }
+ rpc_xprt_switch_add_xprt(xps, xprt);
+out_put_xprt:
+ xprt_put(xprt);
+out_put_switch:
+ xprt_switch_put(xps);
+ return ret;
+}
+EXPORT_SYMBOL_GPL(rpc_clnt_add_xprt);
+
#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
static void rpc_show_header(void)
{
@@ -2482,57 +2683,39 @@ void rpc_show_tasks(struct net *net)
#endif
#if IS_ENABLED(CONFIG_SUNRPC_SWAP)
+static int
+rpc_clnt_swap_activate_callback(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *dummy)
+{
+ return xprt_enable_swap(xprt);
+}
+
int
rpc_clnt_swap_activate(struct rpc_clnt *clnt)
{
- int ret = 0;
- struct rpc_xprt *xprt;
-
- if (atomic_inc_return(&clnt->cl_swapper) == 1) {
-retry:
- rcu_read_lock();
- xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
- rcu_read_unlock();
- if (!xprt) {
- /*
- * If we didn't get a reference, then we likely are
- * racing with a migration event. Wait for a grace
- * period and try again.
- */
- synchronize_rcu();
- goto retry;
- }
-
- ret = xprt_enable_swap(xprt);
- xprt_put(xprt);
- }
- return ret;
+ if (atomic_inc_return(&clnt->cl_swapper) == 1)
+ return rpc_clnt_iterate_for_each_xprt(clnt,
+ rpc_clnt_swap_activate_callback, NULL);
+ return 0;
}
EXPORT_SYMBOL_GPL(rpc_clnt_swap_activate);
+static int
+rpc_clnt_swap_deactivate_callback(struct rpc_clnt *clnt,
+ struct rpc_xprt *xprt,
+ void *dummy)
+{
+ xprt_disable_swap(xprt);
+ return 0;
+}
+
void
rpc_clnt_swap_deactivate(struct rpc_clnt *clnt)
{
- struct rpc_xprt *xprt;
-
- if (atomic_dec_if_positive(&clnt->cl_swapper) == 0) {
-retry:
- rcu_read_lock();
- xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
- rcu_read_unlock();
- if (!xprt) {
- /*
- * If we didn't get a reference, then we likely are
- * racing with a migration event. Wait for a grace
- * period and try again.
- */
- synchronize_rcu();
- goto retry;
- }
-
- xprt_disable_swap(xprt);
- xprt_put(xprt);
- }
+ if (atomic_dec_if_positive(&clnt->cl_swapper) == 0)
+ rpc_clnt_iterate_for_each_xprt(clnt,
+ rpc_clnt_swap_deactivate_callback, NULL);
}
EXPORT_SYMBOL_GPL(rpc_clnt_swap_deactivate);
#endif /* CONFIG_SUNRPC_SWAP */
diff --git a/net/sunrpc/rpc_pipe.c b/net/sunrpc/rpc_pipe.c
index d81186d34558..fc48eca21fd2 100644
--- a/net/sunrpc/rpc_pipe.c
+++ b/net/sunrpc/rpc_pipe.c
@@ -172,7 +172,7 @@ rpc_close_pipes(struct inode *inode)
int need_release;
LIST_HEAD(free_list);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
spin_lock(&pipe->lock);
need_release = pipe->nreaders != 0 || pipe->nwriters != 0;
pipe->nreaders = 0;
@@ -188,7 +188,7 @@ rpc_close_pipes(struct inode *inode)
cancel_delayed_work_sync(&pipe->queue_timeout);
rpc_inode_setowner(inode, NULL);
RPC_I(inode)->pipe = NULL;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
}
static struct inode *
@@ -221,7 +221,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
int first_open;
int res = -ENXIO;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
pipe = RPC_I(inode)->pipe;
if (pipe == NULL)
goto out;
@@ -237,7 +237,7 @@ rpc_pipe_open(struct inode *inode, struct file *filp)
pipe->nwriters++;
res = 0;
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return res;
}
@@ -248,7 +248,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
struct rpc_pipe_msg *msg;
int last_close;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
pipe = RPC_I(inode)->pipe;
if (pipe == NULL)
goto out;
@@ -278,7 +278,7 @@ rpc_pipe_release(struct inode *inode, struct file *filp)
if (last_close && pipe->ops->release_pipe)
pipe->ops->release_pipe(inode);
out:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return 0;
}
@@ -290,7 +290,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
struct rpc_pipe_msg *msg;
int res = 0;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
pipe = RPC_I(inode)->pipe;
if (pipe == NULL) {
res = -EPIPE;
@@ -322,7 +322,7 @@ rpc_pipe_read(struct file *filp, char __user *buf, size_t len, loff_t *offset)
pipe->ops->destroy_msg(msg);
}
out_unlock:
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return res;
}
@@ -332,11 +332,11 @@ rpc_pipe_write(struct file *filp, const char __user *buf, size_t len, loff_t *of
struct inode *inode = file_inode(filp);
int res;
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
res = -EPIPE;
if (RPC_I(inode)->pipe != NULL)
res = RPC_I(inode)->pipe->ops->downcall(filp, buf, len);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return res;
}
@@ -349,12 +349,12 @@ rpc_pipe_poll(struct file *filp, struct poll_table_struct *wait)
poll_wait(filp, &rpci->waitq, wait);
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
if (rpci->pipe == NULL)
mask |= POLLERR | POLLHUP;
else if (filp->private_data || !list_empty(&rpci->pipe->pipe))
mask |= POLLIN | POLLRDNORM;
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return mask;
}
@@ -367,10 +367,10 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
switch (cmd) {
case FIONREAD:
- mutex_lock(&inode->i_mutex);
+ inode_lock(inode);
pipe = RPC_I(inode)->pipe;
if (pipe == NULL) {
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return -EPIPE;
}
spin_lock(&pipe->lock);
@@ -381,7 +381,7 @@ rpc_pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg)
len += msg->len - msg->copied;
}
spin_unlock(&pipe->lock);
- mutex_unlock(&inode->i_mutex);
+ inode_unlock(inode);
return put_user(len, (int __user *)arg);
default:
return -EINVAL;
@@ -617,9 +617,9 @@ int rpc_rmdir(struct dentry *dentry)
parent = dget_parent(dentry);
dir = d_inode(parent);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
error = __rpc_rmdir(dir, dentry);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
dput(parent);
return error;
}
@@ -701,9 +701,9 @@ static void rpc_depopulate(struct dentry *parent,
{
struct inode *dir = d_inode(parent);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_CHILD);
+ inode_lock_nested(dir, I_MUTEX_CHILD);
__rpc_depopulate(parent, files, start, eof);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
}
static int rpc_populate(struct dentry *parent,
@@ -715,7 +715,7 @@ static int rpc_populate(struct dentry *parent,
struct dentry *dentry;
int i, err;
- mutex_lock(&dir->i_mutex);
+ inode_lock(dir);
for (i = start; i < eof; i++) {
dentry = __rpc_lookup_create_exclusive(parent, files[i].name);
err = PTR_ERR(dentry);
@@ -739,11 +739,11 @@ static int rpc_populate(struct dentry *parent,
if (err != 0)
goto out_bad;
}
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return 0;
out_bad:
__rpc_depopulate(parent, files, start, eof);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
printk(KERN_WARNING "%s: %s failed to populate directory %pd\n",
__FILE__, __func__, parent);
return err;
@@ -757,7 +757,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
struct inode *dir = d_inode(parent);
int error;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
dentry = __rpc_lookup_create_exclusive(parent, name);
if (IS_ERR(dentry))
goto out;
@@ -770,7 +770,7 @@ static struct dentry *rpc_mkdir_populate(struct dentry *parent,
goto err_rmdir;
}
out:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return dentry;
err_rmdir:
__rpc_rmdir(dir, dentry);
@@ -788,11 +788,11 @@ static int rpc_rmdir_depopulate(struct dentry *dentry,
parent = dget_parent(dentry);
dir = d_inode(parent);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
if (depopulate != NULL)
depopulate(dentry);
error = __rpc_rmdir(dir, dentry);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
dput(parent);
return error;
}
@@ -828,7 +828,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
if (pipe->ops->downcall == NULL)
umode &= ~S_IWUGO;
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
dentry = __rpc_lookup_create_exclusive(parent, name);
if (IS_ERR(dentry))
goto out;
@@ -837,7 +837,7 @@ struct dentry *rpc_mkpipe_dentry(struct dentry *parent, const char *name,
if (err)
goto out_err;
out:
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
return dentry;
out_err:
dentry = ERR_PTR(err);
@@ -865,9 +865,9 @@ rpc_unlink(struct dentry *dentry)
parent = dget_parent(dentry);
dir = d_inode(parent);
- mutex_lock_nested(&dir->i_mutex, I_MUTEX_PARENT);
+ inode_lock_nested(dir, I_MUTEX_PARENT);
error = __rpc_rmpipe(dir, dentry);
- mutex_unlock(&dir->i_mutex);
+ inode_unlock(dir);
dput(parent);
return error;
}
@@ -1390,8 +1390,8 @@ rpc_fill_super(struct super_block *sb, void *data, int silent)
struct sunrpc_net *sn = net_generic(net, sunrpc_net_id);
int err;
- sb->s_blocksize = PAGE_CACHE_SIZE;
- sb->s_blocksize_bits = PAGE_CACHE_SHIFT;
+ sb->s_blocksize = PAGE_SIZE;
+ sb->s_blocksize_bits = PAGE_SHIFT;
sb->s_magic = RPCAUTH_GSSMAGIC;
sb->s_op = &s_ops;
sb->s_d_op = &simple_dentry_operations;
@@ -1500,7 +1500,7 @@ int register_rpc_pipefs(void)
rpc_inode_cachep = kmem_cache_create("rpc_inode_cache",
sizeof(struct rpc_inode),
0, (SLAB_HWCACHE_ALIGN|SLAB_RECLAIM_ACCOUNT|
- SLAB_MEM_SPREAD),
+ SLAB_MEM_SPREAD|SLAB_ACCOUNT),
init_once);
if (!rpc_inode_cachep)
return -ENOMEM;
diff --git a/net/sunrpc/rpcb_clnt.c b/net/sunrpc/rpcb_clnt.c
index cf5770d8f49a..5b30603596d0 100644
--- a/net/sunrpc/rpcb_clnt.c
+++ b/net/sunrpc/rpcb_clnt.c
@@ -648,10 +648,10 @@ static struct rpc_task *rpcb_call_async(struct rpc_clnt *rpcb_clnt, struct rpcbi
static struct rpc_clnt *rpcb_find_transport_owner(struct rpc_clnt *clnt)
{
struct rpc_clnt *parent = clnt->cl_parent;
- struct rpc_xprt *xprt = rcu_dereference(clnt->cl_xprt);
+ struct rpc_xprt_switch *xps = rcu_access_pointer(clnt->cl_xpi.xpi_xpswitch);
while (parent != clnt) {
- if (rcu_dereference(parent->cl_xprt) != xprt)
+ if (rcu_access_pointer(parent->cl_xpi.xpi_xpswitch) != xps)
break;
if (clnt->cl_autobind)
break;
@@ -683,11 +683,9 @@ void rpcb_getport_async(struct rpc_task *task)
int status;
rcu_read_lock();
- do {
- clnt = rpcb_find_transport_owner(task->tk_client);
- xprt = xprt_get(rcu_dereference(clnt->cl_xprt));
- } while (xprt == NULL);
+ clnt = rpcb_find_transport_owner(task->tk_client);
rcu_read_unlock();
+ xprt = xprt_get(task->tk_xprt);
dprintk("RPC: %5u %s(%s, %u, %u, %d)\n",
task->tk_pid, __func__,
diff --git a/net/sunrpc/sched.c b/net/sunrpc/sched.c
index f14f24ee9983..fcfd48d263f6 100644
--- a/net/sunrpc/sched.c
+++ b/net/sunrpc/sched.c
@@ -250,11 +250,11 @@ void rpc_destroy_wait_queue(struct rpc_wait_queue *queue)
}
EXPORT_SYMBOL_GPL(rpc_destroy_wait_queue);
-static int rpc_wait_bit_killable(struct wait_bit_key *key)
+static int rpc_wait_bit_killable(struct wait_bit_key *key, int mode)
{
- if (fatal_signal_pending(current))
- return -ERESTARTSYS;
freezable_schedule_unsafe();
+ if (signal_pending_state(mode, current))
+ return -ERESTARTSYS;
return 0;
}
@@ -909,6 +909,8 @@ static void rpc_init_task(struct rpc_task *task, const struct rpc_task_setup *ta
/* Initialize workqueue for async tasks */
task->tk_workqueue = task_setup_data->workqueue;
+ task->tk_xprt = xprt_get(task_setup_data->rpc_xprt);
+
if (task->tk_ops->rpc_call_prepare != NULL)
task->tk_action = rpc_prepare_task;
diff --git a/net/sunrpc/socklib.c b/net/sunrpc/socklib.c
index 2df87f78e518..de70c78025d7 100644
--- a/net/sunrpc/socklib.c
+++ b/net/sunrpc/socklib.c
@@ -96,8 +96,8 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct
if (base || xdr->page_base) {
pglen -= base;
base += xdr->page_base;
- ppage += base >> PAGE_CACHE_SHIFT;
- base &= ~PAGE_CACHE_MASK;
+ ppage += base >> PAGE_SHIFT;
+ base &= ~PAGE_MASK;
}
do {
char *kaddr;
@@ -113,7 +113,7 @@ ssize_t xdr_partial_copy_from_skb(struct xdr_buf *xdr, unsigned int base, struct
}
}
- len = PAGE_CACHE_SIZE;
+ len = PAGE_SIZE;
kaddr = kmap_atomic(*ppage);
if (base) {
len -= base;
diff --git a/net/sunrpc/svc.c b/net/sunrpc/svc.c
index a8f579df14d8..cc9852897395 100644
--- a/net/sunrpc/svc.c
+++ b/net/sunrpc/svc.c
@@ -1364,14 +1364,22 @@ bc_svc_process(struct svc_serv *serv, struct rpc_rqst *req,
memcpy(&rqstp->rq_arg, &req->rq_rcv_buf, sizeof(rqstp->rq_arg));
memcpy(&rqstp->rq_res, &req->rq_snd_buf, sizeof(rqstp->rq_res));
+ /* Adjust the argument buffer length */
+ rqstp->rq_arg.len = req->rq_private_buf.len;
+ if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len) {
+ rqstp->rq_arg.head[0].iov_len = rqstp->rq_arg.len;
+ rqstp->rq_arg.page_len = 0;
+ } else if (rqstp->rq_arg.len <= rqstp->rq_arg.head[0].iov_len +
+ rqstp->rq_arg.page_len)
+ rqstp->rq_arg.page_len = rqstp->rq_arg.len -
+ rqstp->rq_arg.head[0].iov_len;
+ else
+ rqstp->rq_arg.len = rqstp->rq_arg.head[0].iov_len +
+ rqstp->rq_arg.page_len;
+
/* reset result send buffer "put" position */
resv->iov_len = 0;
- if (rqstp->rq_prot != IPPROTO_TCP) {
- printk(KERN_ERR "No support for Non-TCP transports!\n");
- BUG();
- }
-
/*
* Skip the next two words because they've already been
* processed in the transport
diff --git a/net/sunrpc/svc_xprt.c b/net/sunrpc/svc_xprt.c
index a6cbb2104667..7422f28818b2 100644
--- a/net/sunrpc/svc_xprt.c
+++ b/net/sunrpc/svc_xprt.c
@@ -10,11 +10,13 @@
#include <linux/kthread.h>
#include <linux/slab.h>
#include <net/sock.h>
+#include <linux/sunrpc/addr.h>
#include <linux/sunrpc/stats.h>
#include <linux/sunrpc/svc_xprt.h>
#include <linux/sunrpc/svcsock.h>
#include <linux/sunrpc/xprt.h>
#include <linux/module.h>
+#include <linux/netdevice.h>
#include <trace/events/sunrpc.h>
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
@@ -938,6 +940,49 @@ static void svc_age_temp_xprts(unsigned long closure)
mod_timer(&serv->sv_temptimer, jiffies + svc_conn_age_period * HZ);
}
+/* Close temporary transports whose xpt_local matches server_addr immediately
+ * instead of waiting for them to be picked up by the timer.
+ *
+ * This is meant to be called from a notifier_block that runs when an ip
+ * address is deleted.
+ */
+void svc_age_temp_xprts_now(struct svc_serv *serv, struct sockaddr *server_addr)
+{
+ struct svc_xprt *xprt;
+ struct svc_sock *svsk;
+ struct socket *sock;
+ struct list_head *le, *next;
+ LIST_HEAD(to_be_closed);
+ struct linger no_linger = {
+ .l_onoff = 1,
+ .l_linger = 0,
+ };
+
+ spin_lock_bh(&serv->sv_lock);
+ list_for_each_safe(le, next, &serv->sv_tempsocks) {
+ xprt = list_entry(le, struct svc_xprt, xpt_list);
+ if (rpc_cmp_addr(server_addr, (struct sockaddr *)
+ &xprt->xpt_local)) {
+ dprintk("svc_age_temp_xprts_now: found %p\n", xprt);
+ list_move(le, &to_be_closed);
+ }
+ }
+ spin_unlock_bh(&serv->sv_lock);
+
+ while (!list_empty(&to_be_closed)) {
+ le = to_be_closed.next;
+ list_del_init(le);
+ xprt = list_entry(le, struct svc_xprt, xpt_list);
+ dprintk("svc_age_temp_xprts_now: closing %p\n", xprt);
+ svsk = container_of(xprt, struct svc_sock, sk_xprt);
+ sock = svsk->sk_sock;
+ kernel_setsockopt(sock, SOL_SOCKET, SO_LINGER,
+ (char *)&no_linger, sizeof(no_linger));
+ svc_close_xprt(xprt);
+ }
+}
+EXPORT_SYMBOL_GPL(svc_age_temp_xprts_now);
+
static void call_xpt_users(struct svc_xprt *xprt)
{
struct svc_xpt_user *u;
diff --git a/net/sunrpc/svcauth.c b/net/sunrpc/svcauth.c
index 79c0f3459b5c..69841db1f533 100644
--- a/net/sunrpc/svcauth.c
+++ b/net/sunrpc/svcauth.c
@@ -55,6 +55,7 @@ svc_authenticate(struct svc_rqst *rqstp, __be32 *authp)
spin_unlock(&authtab_lock);
rqstp->rq_auth_slack = 0;
+ init_svc_cred(&rqstp->rq_cred);
rqstp->rq_authop = aops;
return aops->accept(rqstp, authp);
@@ -63,6 +64,7 @@ EXPORT_SYMBOL_GPL(svc_authenticate);
int svc_set_client(struct svc_rqst *rqstp)
{
+ rqstp->rq_client = NULL;
return rqstp->rq_authop->set_client(rqstp);
}
EXPORT_SYMBOL_GPL(svc_set_client);
diff --git a/net/sunrpc/svcauth_unix.c b/net/sunrpc/svcauth_unix.c
index 621ca7b4a155..dfacdc95b3f5 100644
--- a/net/sunrpc/svcauth_unix.c
+++ b/net/sunrpc/svcauth_unix.c
@@ -728,10 +728,6 @@ svcauth_null_accept(struct svc_rqst *rqstp, __be32 *authp)
struct kvec *resv = &rqstp->rq_res.head[0];
struct svc_cred *cred = &rqstp->rq_cred;
- cred->cr_group_info = NULL;
- cred->cr_principal = NULL;
- rqstp->rq_client = NULL;
-
if (argv->iov_len < 3*4)
return SVC_GARBAGE;
@@ -794,10 +790,6 @@ svcauth_unix_accept(struct svc_rqst *rqstp, __be32 *authp)
u32 slen, i;
int len = argv->iov_len;
- cred->cr_group_info = NULL;
- cred->cr_principal = NULL;
- rqstp->rq_client = NULL;
-
if ((len -= 3*4) < 0)
return SVC_GARBAGE;
diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
index 0c8120229a03..1413cdcc131c 100644
--- a/net/sunrpc/svcsock.c
+++ b/net/sunrpc/svcsock.c
@@ -181,7 +181,7 @@ int svc_send_common(struct socket *sock, struct xdr_buf *xdr,
struct page **ppage = xdr->pages;
size_t base = xdr->page_base;
unsigned int pglen = xdr->page_len;
- unsigned int flags = MSG_MORE;
+ unsigned int flags = MSG_MORE | MSG_SENDPAGE_NOTLAST;
int slen;
int len = 0;
@@ -399,6 +399,31 @@ static int svc_sock_secure_port(struct svc_rqst *rqstp)
return svc_port_is_privileged(svc_addr(rqstp));
}
+static bool sunrpc_waitqueue_active(wait_queue_head_t *wq)
+{
+ if (!wq)
+ return false;
+ /*
+ * There should normally be a memory * barrier here--see
+ * wq_has_sleeper().
+ *
+ * It appears that isn't currently necessary, though, basically
+ * because callers all appear to have sufficient memory barriers
+ * between the time the relevant change is made and the
+ * time they call these callbacks.
+ *
+ * The nfsd code itself doesn't actually explicitly wait on
+ * these waitqueues, but it may wait on them for example in
+ * sendpage() or sendmsg() calls. (And those may be the only
+ * places, since it it uses nonblocking reads.)
+ *
+ * Maybe we should add the memory barriers anyway, but these are
+ * hot paths so we'd need to be convinced there's no sigificant
+ * penalty.
+ */
+ return waitqueue_active(wq);
+}
+
/*
* INET callback when data has been received on the socket.
*/
@@ -414,7 +439,7 @@ static void svc_udp_data_ready(struct sock *sk)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible(wq);
}
@@ -432,7 +457,7 @@ static void svc_write_space(struct sock *sk)
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (wq && waitqueue_active(wq)) {
+ if (sunrpc_waitqueue_active(wq)) {
dprintk("RPC svc_write_space: someone sleeping on %p\n",
svsk);
wake_up_interruptible(wq);
@@ -787,7 +812,7 @@ static void svc_tcp_listen_data_ready(struct sock *sk)
}
wq = sk_sleep(sk);
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible_all(wq);
}
@@ -808,7 +833,7 @@ static void svc_tcp_state_change(struct sock *sk)
set_bit(XPT_CLOSE, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible_all(wq);
}
@@ -823,7 +848,7 @@ static void svc_tcp_data_ready(struct sock *sk)
set_bit(XPT_DATA, &svsk->sk_xprt.xpt_flags);
svc_xprt_enqueue(&svsk->sk_xprt);
}
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible(wq);
}
@@ -1367,7 +1392,6 @@ EXPORT_SYMBOL_GPL(svc_sock_update_bufs);
/*
* Initialize socket for RPC use and create svc_sock struct
- * XXX: May want to setsockopt SO_SNDBUF and SO_RCVBUF.
*/
static struct svc_sock *svc_setup_socket(struct svc_serv *serv,
struct socket *sock,
@@ -1594,7 +1618,7 @@ static void svc_sock_detach(struct svc_xprt *xprt)
sk->sk_write_space = svsk->sk_owspace;
wq = sk_sleep(sk);
- if (wq && waitqueue_active(wq))
+ if (sunrpc_waitqueue_active(wq))
wake_up_interruptible(wq);
}
diff --git a/net/sunrpc/sysctl.c b/net/sunrpc/sysctl.c
index 887f0183b4c6..c88d9bc06f5c 100644
--- a/net/sunrpc/sysctl.c
+++ b/net/sunrpc/sysctl.c
@@ -76,7 +76,7 @@ static int
proc_dodebug(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp, loff_t *ppos)
{
- char tmpbuf[20], c, *s;
+ char tmpbuf[20], c, *s = NULL;
char __user *p;
unsigned int value;
size_t left, len;
@@ -103,23 +103,24 @@ proc_dodebug(struct ctl_table *table, int write,
return -EFAULT;
tmpbuf[left] = '\0';
- for (s = tmpbuf, value = 0; '0' <= *s && *s <= '9'; s++, left--)
- value = 10 * value + (*s - '0');
- if (*s && !isspace(*s))
- return -EINVAL;
- while (left && isspace(*s))
- left--, s++;
+ value = simple_strtol(tmpbuf, &s, 0);
+ if (s) {
+ left -= (s - tmpbuf);
+ if (left && !isspace(*s))
+ return -EINVAL;
+ while (left && isspace(*s))
+ left--, s++;
+ } else
+ left = 0;
*(unsigned int *) table->data = value;
/* Display the RPC tasks on writing to rpc_debug */
if (strcmp(table->procname, "rpc_debug") == 0)
rpc_show_tasks(&init_net);
} else {
- if (!access_ok(VERIFY_WRITE, buffer, left))
- return -EFAULT;
- len = sprintf(tmpbuf, "%d", *(unsigned int *) table->data);
+ len = sprintf(tmpbuf, "0x%04x", *(unsigned int *) table->data);
if (len > left)
len = left;
- if (__copy_to_user(buffer, tmpbuf, len))
+ if (copy_to_user(buffer, tmpbuf, len))
return -EFAULT;
if ((left -= len) > 0) {
if (put_user('\n', (char __user *)buffer + len))
diff --git a/net/sunrpc/xdr.c b/net/sunrpc/xdr.c
index 4439ac4c1b53..6bdb3865212d 100644
--- a/net/sunrpc/xdr.c
+++ b/net/sunrpc/xdr.c
@@ -164,7 +164,7 @@ EXPORT_SYMBOL_GPL(xdr_inline_pages);
* Note: the addresses pgto_base and pgfrom_base are both calculated in
* the same way:
* if a memory area starts at byte 'base' in page 'pages[i]',
- * then its address is given as (i << PAGE_CACHE_SHIFT) + base
+ * then its address is given as (i << PAGE_SHIFT) + base
* Also note: pgfrom_base must be < pgto_base, but the memory areas
* they point to may overlap.
*/
@@ -181,20 +181,20 @@ _shift_data_right_pages(struct page **pages, size_t pgto_base,
pgto_base += len;
pgfrom_base += len;
- pgto = pages + (pgto_base >> PAGE_CACHE_SHIFT);
- pgfrom = pages + (pgfrom_base >> PAGE_CACHE_SHIFT);
+ pgto = pages + (pgto_base >> PAGE_SHIFT);
+ pgfrom = pages + (pgfrom_base >> PAGE_SHIFT);
- pgto_base &= ~PAGE_CACHE_MASK;
- pgfrom_base &= ~PAGE_CACHE_MASK;
+ pgto_base &= ~PAGE_MASK;
+ pgfrom_base &= ~PAGE_MASK;
do {
/* Are any pointers crossing a page boundary? */
if (pgto_base == 0) {
- pgto_base = PAGE_CACHE_SIZE;
+ pgto_base = PAGE_SIZE;
pgto--;
}
if (pgfrom_base == 0) {
- pgfrom_base = PAGE_CACHE_SIZE;
+ pgfrom_base = PAGE_SIZE;
pgfrom--;
}
@@ -236,11 +236,11 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
char *vto;
size_t copy;
- pgto = pages + (pgbase >> PAGE_CACHE_SHIFT);
- pgbase &= ~PAGE_CACHE_MASK;
+ pgto = pages + (pgbase >> PAGE_SHIFT);
+ pgbase &= ~PAGE_MASK;
for (;;) {
- copy = PAGE_CACHE_SIZE - pgbase;
+ copy = PAGE_SIZE - pgbase;
if (copy > len)
copy = len;
@@ -253,7 +253,7 @@ _copy_to_pages(struct page **pages, size_t pgbase, const char *p, size_t len)
break;
pgbase += copy;
- if (pgbase == PAGE_CACHE_SIZE) {
+ if (pgbase == PAGE_SIZE) {
flush_dcache_page(*pgto);
pgbase = 0;
pgto++;
@@ -280,11 +280,11 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
char *vfrom;
size_t copy;
- pgfrom = pages + (pgbase >> PAGE_CACHE_SHIFT);
- pgbase &= ~PAGE_CACHE_MASK;
+ pgfrom = pages + (pgbase >> PAGE_SHIFT);
+ pgbase &= ~PAGE_MASK;
do {
- copy = PAGE_CACHE_SIZE - pgbase;
+ copy = PAGE_SIZE - pgbase;
if (copy > len)
copy = len;
@@ -293,7 +293,7 @@ _copy_from_pages(char *p, struct page **pages, size_t pgbase, size_t len)
kunmap_atomic(vfrom);
pgbase += copy;
- if (pgbase == PAGE_CACHE_SIZE) {
+ if (pgbase == PAGE_SIZE) {
pgbase = 0;
pgfrom++;
}
@@ -1038,8 +1038,8 @@ xdr_buf_subsegment(struct xdr_buf *buf, struct xdr_buf *subbuf,
if (base < buf->page_len) {
subbuf->page_len = min(buf->page_len - base, len);
base += buf->page_base;
- subbuf->page_base = base & ~PAGE_CACHE_MASK;
- subbuf->pages = &buf->pages[base >> PAGE_CACHE_SHIFT];
+ subbuf->page_base = base & ~PAGE_MASK;
+ subbuf->pages = &buf->pages[base >> PAGE_SHIFT];
len -= subbuf->page_len;
base = 0;
} else {
@@ -1297,9 +1297,9 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
todo -= avail_here;
base += buf->page_base;
- ppages = buf->pages + (base >> PAGE_CACHE_SHIFT);
- base &= ~PAGE_CACHE_MASK;
- avail_page = min_t(unsigned int, PAGE_CACHE_SIZE - base,
+ ppages = buf->pages + (base >> PAGE_SHIFT);
+ base &= ~PAGE_MASK;
+ avail_page = min_t(unsigned int, PAGE_SIZE - base,
avail_here);
c = kmap(*ppages) + base;
@@ -1383,7 +1383,7 @@ xdr_xcode_array2(struct xdr_buf *buf, unsigned int base,
}
avail_page = min(avail_here,
- (unsigned int) PAGE_CACHE_SIZE);
+ (unsigned int) PAGE_SIZE);
}
base = buf->page_len; /* align to start of tail */
}
@@ -1479,9 +1479,9 @@ xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
if (page_len > len)
page_len = len;
len -= page_len;
- page_offset = (offset + buf->page_base) & (PAGE_CACHE_SIZE - 1);
- i = (offset + buf->page_base) >> PAGE_CACHE_SHIFT;
- thislen = PAGE_CACHE_SIZE - page_offset;
+ page_offset = (offset + buf->page_base) & (PAGE_SIZE - 1);
+ i = (offset + buf->page_base) >> PAGE_SHIFT;
+ thislen = PAGE_SIZE - page_offset;
do {
if (thislen > page_len)
thislen = page_len;
@@ -1492,7 +1492,7 @@ xdr_process_buf(struct xdr_buf *buf, unsigned int offset, unsigned int len,
page_len -= thislen;
i++;
page_offset = 0;
- thislen = PAGE_CACHE_SIZE;
+ thislen = PAGE_SIZE;
} while (page_len != 0);
offset = 0;
}
diff --git a/net/sunrpc/xprt.c b/net/sunrpc/xprt.c
index 2e98f4a243e5..216a1385718a 100644
--- a/net/sunrpc/xprt.c
+++ b/net/sunrpc/xprt.c
@@ -48,6 +48,7 @@
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/metrics.h>
#include <linux/sunrpc/bc_xprt.h>
+#include <linux/rcupdate.h>
#include <trace/events/sunrpc.h>
@@ -1166,7 +1167,7 @@ void xprt_free(struct rpc_xprt *xprt)
{
put_net(xprt->xprt_net);
xprt_free_all_slots(xprt);
- kfree(xprt);
+ kfree_rcu(xprt, rcu);
}
EXPORT_SYMBOL_GPL(xprt_free);
@@ -1180,7 +1181,7 @@ EXPORT_SYMBOL_GPL(xprt_free);
*/
void xprt_reserve(struct rpc_task *task)
{
- struct rpc_xprt *xprt;
+ struct rpc_xprt *xprt = task->tk_xprt;
task->tk_status = 0;
if (task->tk_rqstp != NULL)
@@ -1188,11 +1189,8 @@ void xprt_reserve(struct rpc_task *task)
task->tk_timeout = 0;
task->tk_status = -EAGAIN;
- rcu_read_lock();
- xprt = rcu_dereference(task->tk_client->cl_xprt);
if (!xprt_throttle_congested(xprt, task))
xprt->ops->alloc_slot(xprt, task);
- rcu_read_unlock();
}
/**
@@ -1206,7 +1204,7 @@ void xprt_reserve(struct rpc_task *task)
*/
void xprt_retry_reserve(struct rpc_task *task)
{
- struct rpc_xprt *xprt;
+ struct rpc_xprt *xprt = task->tk_xprt;
task->tk_status = 0;
if (task->tk_rqstp != NULL)
@@ -1214,10 +1212,7 @@ void xprt_retry_reserve(struct rpc_task *task)
task->tk_timeout = 0;
task->tk_status = -EAGAIN;
- rcu_read_lock();
- xprt = rcu_dereference(task->tk_client->cl_xprt);
xprt->ops->alloc_slot(xprt, task);
- rcu_read_unlock();
}
static inline __be32 xprt_alloc_xid(struct rpc_xprt *xprt)
@@ -1264,11 +1259,9 @@ void xprt_release(struct rpc_task *task)
if (req == NULL) {
if (task->tk_client) {
- rcu_read_lock();
- xprt = rcu_dereference(task->tk_client->cl_xprt);
+ xprt = task->tk_xprt;
if (xprt->snd_task == task)
xprt_release_write(xprt, task);
- rcu_read_unlock();
}
return;
}
@@ -1307,7 +1300,7 @@ void xprt_release(struct rpc_task *task)
static void xprt_init(struct rpc_xprt *xprt, struct net *net)
{
- atomic_set(&xprt->count, 1);
+ kref_init(&xprt->kref);
spin_lock_init(&xprt->transport_lock);
spin_lock_init(&xprt->reserve_lock);
@@ -1318,6 +1311,7 @@ static void xprt_init(struct rpc_xprt *xprt, struct net *net)
spin_lock_init(&xprt->bc_pa_lock);
INIT_LIST_HEAD(&xprt->bc_pa_list);
#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+ INIT_LIST_HEAD(&xprt->xprt_switch);
xprt->last_used = jiffies;
xprt->cwnd = RPC_INITCWND;
@@ -1415,6 +1409,24 @@ static void xprt_destroy(struct rpc_xprt *xprt)
xprt->ops->destroy(xprt);
}
+static void xprt_destroy_kref(struct kref *kref)
+{
+ xprt_destroy(container_of(kref, struct rpc_xprt, kref));
+}
+
+/**
+ * xprt_get - return a reference to an RPC transport.
+ * @xprt: pointer to the transport
+ *
+ */
+struct rpc_xprt *xprt_get(struct rpc_xprt *xprt)
+{
+ if (xprt != NULL && kref_get_unless_zero(&xprt->kref))
+ return xprt;
+ return NULL;
+}
+EXPORT_SYMBOL_GPL(xprt_get);
+
/**
* xprt_put - release a reference to an RPC transport.
* @xprt: pointer to the transport
@@ -1422,6 +1434,7 @@ static void xprt_destroy(struct rpc_xprt *xprt)
*/
void xprt_put(struct rpc_xprt *xprt)
{
- if (atomic_dec_and_test(&xprt->count))
- xprt_destroy(xprt);
+ if (xprt != NULL)
+ kref_put(&xprt->kref, xprt_destroy_kref);
}
+EXPORT_SYMBOL_GPL(xprt_put);
diff --git a/net/sunrpc/xprtmultipath.c b/net/sunrpc/xprtmultipath.c
new file mode 100644
index 000000000000..e7fd76975d86
--- /dev/null
+++ b/net/sunrpc/xprtmultipath.c
@@ -0,0 +1,475 @@
+/*
+ * Multipath support for RPC
+ *
+ * Copyright (c) 2015, 2016, Primary Data, Inc. All rights reserved.
+ *
+ * Trond Myklebust <trond.myklebust@primarydata.com>
+ *
+ */
+#include <linux/types.h>
+#include <linux/kref.h>
+#include <linux/list.h>
+#include <linux/rcupdate.h>
+#include <linux/rculist.h>
+#include <linux/slab.h>
+#include <asm/cmpxchg.h>
+#include <linux/spinlock.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/xprtmultipath.h>
+
+typedef struct rpc_xprt *(*xprt_switch_find_xprt_t)(struct list_head *head,
+ const struct rpc_xprt *cur);
+
+static const struct rpc_xprt_iter_ops rpc_xprt_iter_singular;
+static const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin;
+static const struct rpc_xprt_iter_ops rpc_xprt_iter_listall;
+
+static void xprt_switch_add_xprt_locked(struct rpc_xprt_switch *xps,
+ struct rpc_xprt *xprt)
+{
+ if (unlikely(xprt_get(xprt) == NULL))
+ return;
+ list_add_tail_rcu(&xprt->xprt_switch, &xps->xps_xprt_list);
+ smp_wmb();
+ if (xps->xps_nxprts == 0)
+ xps->xps_net = xprt->xprt_net;
+ xps->xps_nxprts++;
+}
+
+/**
+ * rpc_xprt_switch_add_xprt - Add a new rpc_xprt to an rpc_xprt_switch
+ * @xps: pointer to struct rpc_xprt_switch
+ * @xprt: pointer to struct rpc_xprt
+ *
+ * Adds xprt to the end of the list of struct rpc_xprt in xps.
+ */
+void rpc_xprt_switch_add_xprt(struct rpc_xprt_switch *xps,
+ struct rpc_xprt *xprt)
+{
+ if (xprt == NULL)
+ return;
+ spin_lock(&xps->xps_lock);
+ if (xps->xps_net == xprt->xprt_net || xps->xps_net == NULL)
+ xprt_switch_add_xprt_locked(xps, xprt);
+ spin_unlock(&xps->xps_lock);
+}
+
+static void xprt_switch_remove_xprt_locked(struct rpc_xprt_switch *xps,
+ struct rpc_xprt *xprt)
+{
+ if (unlikely(xprt == NULL))
+ return;
+ xps->xps_nxprts--;
+ if (xps->xps_nxprts == 0)
+ xps->xps_net = NULL;
+ smp_wmb();
+ list_del_rcu(&xprt->xprt_switch);
+}
+
+/**
+ * rpc_xprt_switch_remove_xprt - Removes an rpc_xprt from a rpc_xprt_switch
+ * @xps: pointer to struct rpc_xprt_switch
+ * @xprt: pointer to struct rpc_xprt
+ *
+ * Removes xprt from the list of struct rpc_xprt in xps.
+ */
+void rpc_xprt_switch_remove_xprt(struct rpc_xprt_switch *xps,
+ struct rpc_xprt *xprt)
+{
+ spin_lock(&xps->xps_lock);
+ xprt_switch_remove_xprt_locked(xps, xprt);
+ spin_unlock(&xps->xps_lock);
+ xprt_put(xprt);
+}
+
+/**
+ * xprt_switch_alloc - Allocate a new struct rpc_xprt_switch
+ * @xprt: pointer to struct rpc_xprt
+ * @gfp_flags: allocation flags
+ *
+ * On success, returns an initialised struct rpc_xprt_switch, containing
+ * the entry xprt. Returns NULL on failure.
+ */
+struct rpc_xprt_switch *xprt_switch_alloc(struct rpc_xprt *xprt,
+ gfp_t gfp_flags)
+{
+ struct rpc_xprt_switch *xps;
+
+ xps = kmalloc(sizeof(*xps), gfp_flags);
+ if (xps != NULL) {
+ spin_lock_init(&xps->xps_lock);
+ kref_init(&xps->xps_kref);
+ xps->xps_nxprts = 0;
+ INIT_LIST_HEAD(&xps->xps_xprt_list);
+ xps->xps_iter_ops = &rpc_xprt_iter_singular;
+ xprt_switch_add_xprt_locked(xps, xprt);
+ }
+
+ return xps;
+}
+
+static void xprt_switch_free_entries(struct rpc_xprt_switch *xps)
+{
+ spin_lock(&xps->xps_lock);
+ while (!list_empty(&xps->xps_xprt_list)) {
+ struct rpc_xprt *xprt;
+
+ xprt = list_first_entry(&xps->xps_xprt_list,
+ struct rpc_xprt, xprt_switch);
+ xprt_switch_remove_xprt_locked(xps, xprt);
+ spin_unlock(&xps->xps_lock);
+ xprt_put(xprt);
+ spin_lock(&xps->xps_lock);
+ }
+ spin_unlock(&xps->xps_lock);
+}
+
+static void xprt_switch_free(struct kref *kref)
+{
+ struct rpc_xprt_switch *xps = container_of(kref,
+ struct rpc_xprt_switch, xps_kref);
+
+ xprt_switch_free_entries(xps);
+ kfree_rcu(xps, xps_rcu);
+}
+
+/**
+ * xprt_switch_get - Return a reference to a rpc_xprt_switch
+ * @xps: pointer to struct rpc_xprt_switch
+ *
+ * Returns a reference to xps unless the refcount is already zero.
+ */
+struct rpc_xprt_switch *xprt_switch_get(struct rpc_xprt_switch *xps)
+{
+ if (xps != NULL && kref_get_unless_zero(&xps->xps_kref))
+ return xps;
+ return NULL;
+}
+
+/**
+ * xprt_switch_put - Release a reference to a rpc_xprt_switch
+ * @xps: pointer to struct rpc_xprt_switch
+ *
+ * Release the reference to xps, and free it once the refcount is zero.
+ */
+void xprt_switch_put(struct rpc_xprt_switch *xps)
+{
+ if (xps != NULL)
+ kref_put(&xps->xps_kref, xprt_switch_free);
+}
+
+/**
+ * rpc_xprt_switch_set_roundrobin - Set a round-robin policy on rpc_xprt_switch
+ * @xps: pointer to struct rpc_xprt_switch
+ *
+ * Sets a round-robin default policy for iterators acting on xps.
+ */
+void rpc_xprt_switch_set_roundrobin(struct rpc_xprt_switch *xps)
+{
+ if (READ_ONCE(xps->xps_iter_ops) != &rpc_xprt_iter_roundrobin)
+ WRITE_ONCE(xps->xps_iter_ops, &rpc_xprt_iter_roundrobin);
+}
+
+static
+const struct rpc_xprt_iter_ops *xprt_iter_ops(const struct rpc_xprt_iter *xpi)
+{
+ if (xpi->xpi_ops != NULL)
+ return xpi->xpi_ops;
+ return rcu_dereference(xpi->xpi_xpswitch)->xps_iter_ops;
+}
+
+static
+void xprt_iter_no_rewind(struct rpc_xprt_iter *xpi)
+{
+}
+
+static
+void xprt_iter_default_rewind(struct rpc_xprt_iter *xpi)
+{
+ WRITE_ONCE(xpi->xpi_cursor, NULL);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_first_entry(struct list_head *head)
+{
+ return list_first_or_null_rcu(head, struct rpc_xprt, xprt_switch);
+}
+
+static
+struct rpc_xprt *xprt_iter_first_entry(struct rpc_xprt_iter *xpi)
+{
+ struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
+
+ if (xps == NULL)
+ return NULL;
+ return xprt_switch_find_first_entry(&xps->xps_xprt_list);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_current_entry(struct list_head *head,
+ const struct rpc_xprt *cur)
+{
+ struct rpc_xprt *pos;
+
+ list_for_each_entry_rcu(pos, head, xprt_switch) {
+ if (cur == pos)
+ return pos;
+ }
+ return NULL;
+}
+
+static
+struct rpc_xprt *xprt_iter_current_entry(struct rpc_xprt_iter *xpi)
+{
+ struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
+ struct list_head *head;
+
+ if (xps == NULL)
+ return NULL;
+ head = &xps->xps_xprt_list;
+ if (xpi->xpi_cursor == NULL || xps->xps_nxprts < 2)
+ return xprt_switch_find_first_entry(head);
+ return xprt_switch_find_current_entry(head, xpi->xpi_cursor);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_next_entry(struct list_head *head,
+ const struct rpc_xprt *cur)
+{
+ struct rpc_xprt *pos, *prev = NULL;
+
+ list_for_each_entry_rcu(pos, head, xprt_switch) {
+ if (cur == prev)
+ return pos;
+ prev = pos;
+ }
+ return NULL;
+}
+
+static
+struct rpc_xprt *xprt_switch_set_next_cursor(struct list_head *head,
+ struct rpc_xprt **cursor,
+ xprt_switch_find_xprt_t find_next)
+{
+ struct rpc_xprt *cur, *pos, *old;
+
+ cur = READ_ONCE(*cursor);
+ for (;;) {
+ old = cur;
+ pos = find_next(head, old);
+ if (pos == NULL)
+ break;
+ cur = cmpxchg_relaxed(cursor, old, pos);
+ if (cur == old)
+ break;
+ }
+ return pos;
+}
+
+static
+struct rpc_xprt *xprt_iter_next_entry_multiple(struct rpc_xprt_iter *xpi,
+ xprt_switch_find_xprt_t find_next)
+{
+ struct rpc_xprt_switch *xps = rcu_dereference(xpi->xpi_xpswitch);
+ struct list_head *head;
+
+ if (xps == NULL)
+ return NULL;
+ head = &xps->xps_xprt_list;
+ if (xps->xps_nxprts < 2)
+ return xprt_switch_find_first_entry(head);
+ return xprt_switch_set_next_cursor(head, &xpi->xpi_cursor, find_next);
+}
+
+static
+struct rpc_xprt *xprt_switch_find_next_entry_roundrobin(struct list_head *head,
+ const struct rpc_xprt *cur)
+{
+ struct rpc_xprt *ret;
+
+ ret = xprt_switch_find_next_entry(head, cur);
+ if (ret != NULL)
+ return ret;
+ return xprt_switch_find_first_entry(head);
+}
+
+static
+struct rpc_xprt *xprt_iter_next_entry_roundrobin(struct rpc_xprt_iter *xpi)
+{
+ return xprt_iter_next_entry_multiple(xpi,
+ xprt_switch_find_next_entry_roundrobin);
+}
+
+static
+struct rpc_xprt *xprt_iter_next_entry_all(struct rpc_xprt_iter *xpi)
+{
+ return xprt_iter_next_entry_multiple(xpi, xprt_switch_find_next_entry);
+}
+
+/*
+ * xprt_iter_rewind - Resets the xprt iterator
+ * @xpi: pointer to rpc_xprt_iter
+ *
+ * Resets xpi to ensure that it points to the first entry in the list
+ * of transports.
+ */
+static
+void xprt_iter_rewind(struct rpc_xprt_iter *xpi)
+{
+ rcu_read_lock();
+ xprt_iter_ops(xpi)->xpi_rewind(xpi);
+ rcu_read_unlock();
+}
+
+static void __xprt_iter_init(struct rpc_xprt_iter *xpi,
+ struct rpc_xprt_switch *xps,
+ const struct rpc_xprt_iter_ops *ops)
+{
+ rcu_assign_pointer(xpi->xpi_xpswitch, xprt_switch_get(xps));
+ xpi->xpi_cursor = NULL;
+ xpi->xpi_ops = ops;
+}
+
+/**
+ * xprt_iter_init - Initialise an xprt iterator
+ * @xpi: pointer to rpc_xprt_iter
+ * @xps: pointer to rpc_xprt_switch
+ *
+ * Initialises the iterator to use the default iterator ops
+ * as set in xps. This function is mainly intended for internal
+ * use in the rpc_client.
+ */
+void xprt_iter_init(struct rpc_xprt_iter *xpi,
+ struct rpc_xprt_switch *xps)
+{
+ __xprt_iter_init(xpi, xps, NULL);
+}
+
+/**
+ * xprt_iter_init_listall - Initialise an xprt iterator
+ * @xpi: pointer to rpc_xprt_iter
+ * @xps: pointer to rpc_xprt_switch
+ *
+ * Initialises the iterator to iterate once through the entire list
+ * of entries in xps.
+ */
+void xprt_iter_init_listall(struct rpc_xprt_iter *xpi,
+ struct rpc_xprt_switch *xps)
+{
+ __xprt_iter_init(xpi, xps, &rpc_xprt_iter_listall);
+}
+
+/**
+ * xprt_iter_xchg_switch - Atomically swap out the rpc_xprt_switch
+ * @xpi: pointer to rpc_xprt_iter
+ * @xps: pointer to a new rpc_xprt_switch or NULL
+ *
+ * Swaps out the existing xpi->xpi_xpswitch with a new value.
+ */
+struct rpc_xprt_switch *xprt_iter_xchg_switch(struct rpc_xprt_iter *xpi,
+ struct rpc_xprt_switch *newswitch)
+{
+ struct rpc_xprt_switch __rcu *oldswitch;
+
+ /* Atomically swap out the old xpswitch */
+ oldswitch = xchg(&xpi->xpi_xpswitch, RCU_INITIALIZER(newswitch));
+ if (newswitch != NULL)
+ xprt_iter_rewind(xpi);
+ return rcu_dereference_protected(oldswitch, true);
+}
+
+/**
+ * xprt_iter_destroy - Destroys the xprt iterator
+ * @xpi pointer to rpc_xprt_iter
+ */
+void xprt_iter_destroy(struct rpc_xprt_iter *xpi)
+{
+ xprt_switch_put(xprt_iter_xchg_switch(xpi, NULL));
+}
+
+/**
+ * xprt_iter_xprt - Returns the rpc_xprt pointed to by the cursor
+ * @xpi: pointer to rpc_xprt_iter
+ *
+ * Returns a pointer to the struct rpc_xprt that is currently
+ * pointed to by the cursor.
+ * Caller must be holding rcu_read_lock().
+ */
+struct rpc_xprt *xprt_iter_xprt(struct rpc_xprt_iter *xpi)
+{
+ WARN_ON_ONCE(!rcu_read_lock_held());
+ return xprt_iter_ops(xpi)->xpi_xprt(xpi);
+}
+
+static
+struct rpc_xprt *xprt_iter_get_helper(struct rpc_xprt_iter *xpi,
+ struct rpc_xprt *(*fn)(struct rpc_xprt_iter *))
+{
+ struct rpc_xprt *ret;
+
+ do {
+ ret = fn(xpi);
+ if (ret == NULL)
+ break;
+ ret = xprt_get(ret);
+ } while (ret == NULL);
+ return ret;
+}
+
+/**
+ * xprt_iter_get_xprt - Returns the rpc_xprt pointed to by the cursor
+ * @xpi: pointer to rpc_xprt_iter
+ *
+ * Returns a reference to the struct rpc_xprt that is currently
+ * pointed to by the cursor.
+ */
+struct rpc_xprt *xprt_iter_get_xprt(struct rpc_xprt_iter *xpi)
+{
+ struct rpc_xprt *xprt;
+
+ rcu_read_lock();
+ xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_xprt);
+ rcu_read_unlock();
+ return xprt;
+}
+
+/**
+ * xprt_iter_get_next - Returns the next rpc_xprt following the cursor
+ * @xpi: pointer to rpc_xprt_iter
+ *
+ * Returns a reference to the struct rpc_xprt that immediately follows the
+ * entry pointed to by the cursor.
+ */
+struct rpc_xprt *xprt_iter_get_next(struct rpc_xprt_iter *xpi)
+{
+ struct rpc_xprt *xprt;
+
+ rcu_read_lock();
+ xprt = xprt_iter_get_helper(xpi, xprt_iter_ops(xpi)->xpi_next);
+ rcu_read_unlock();
+ return xprt;
+}
+
+/* Policy for always returning the first entry in the rpc_xprt_switch */
+static
+const struct rpc_xprt_iter_ops rpc_xprt_iter_singular = {
+ .xpi_rewind = xprt_iter_no_rewind,
+ .xpi_xprt = xprt_iter_first_entry,
+ .xpi_next = xprt_iter_first_entry,
+};
+
+/* Policy for round-robin iteration of entries in the rpc_xprt_switch */
+static
+const struct rpc_xprt_iter_ops rpc_xprt_iter_roundrobin = {
+ .xpi_rewind = xprt_iter_default_rewind,
+ .xpi_xprt = xprt_iter_current_entry,
+ .xpi_next = xprt_iter_next_entry_roundrobin,
+};
+
+/* Policy for once-through iteration of entries in the rpc_xprt_switch */
+static
+const struct rpc_xprt_iter_ops rpc_xprt_iter_listall = {
+ .xpi_rewind = xprt_iter_default_rewind,
+ .xpi_xprt = xprt_iter_current_entry,
+ .xpi_next = xprt_iter_next_entry_all,
+};
diff --git a/net/sunrpc/xprtrdma/Makefile b/net/sunrpc/xprtrdma/Makefile
index 48913de240bd..dc9f3b513a05 100644
--- a/net/sunrpc/xprtrdma/Makefile
+++ b/net/sunrpc/xprtrdma/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_SUNRPC_XPRT_RDMA) += rpcrdma.o
rpcrdma-y := transport.o rpc_rdma.o verbs.o \
fmr_ops.o frwr_ops.o physical_ops.o \
- svc_rdma.o svc_rdma_transport.o \
+ svc_rdma.o svc_rdma_backchannel.o svc_rdma_transport.o \
svc_rdma_marshal.o svc_rdma_sendto.o svc_rdma_recvfrom.o \
module.o
+rpcrdma-$(CONFIG_SUNRPC_BACKCHANNEL) += backchannel.o
diff --git a/net/sunrpc/xprtrdma/backchannel.c b/net/sunrpc/xprtrdma/backchannel.c
new file mode 100644
index 000000000000..2dcd7640eeb5
--- /dev/null
+++ b/net/sunrpc/xprtrdma/backchannel.c
@@ -0,0 +1,394 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA.
+ */
+
+#include <linux/module.h>
+#include <linux/sunrpc/xprt.h>
+#include <linux/sunrpc/svc.h>
+#include <linux/sunrpc/svc_xprt.h>
+
+#include "xprt_rdma.h"
+
+#if IS_ENABLED(CONFIG_SUNRPC_DEBUG)
+# define RPCDBG_FACILITY RPCDBG_TRANS
+#endif
+
+#undef RPCRDMA_BACKCHANNEL_DEBUG
+
+static void rpcrdma_bc_free_rqst(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
+{
+ struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+
+ spin_lock(&buf->rb_reqslock);
+ list_del(&req->rl_all);
+ spin_unlock(&buf->rb_reqslock);
+
+ rpcrdma_destroy_req(&r_xprt->rx_ia, req);
+
+ kfree(rqst);
+}
+
+static int rpcrdma_bc_setup_rqst(struct rpcrdma_xprt *r_xprt,
+ struct rpc_rqst *rqst)
+{
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_regbuf *rb;
+ struct rpcrdma_req *req;
+ struct xdr_buf *buf;
+ size_t size;
+
+ req = rpcrdma_create_req(r_xprt);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->rl_backchannel = true;
+
+ size = RPCRDMA_INLINE_WRITE_THRESHOLD(rqst);
+ rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+ if (IS_ERR(rb))
+ goto out_fail;
+ req->rl_rdmabuf = rb;
+
+ size += RPCRDMA_INLINE_READ_THRESHOLD(rqst);
+ rb = rpcrdma_alloc_regbuf(ia, size, GFP_KERNEL);
+ if (IS_ERR(rb))
+ goto out_fail;
+ rb->rg_owner = req;
+ req->rl_sendbuf = rb;
+ /* so that rpcr_to_rdmar works when receiving a request */
+ rqst->rq_buffer = (void *)req->rl_sendbuf->rg_base;
+
+ buf = &rqst->rq_snd_buf;
+ buf->head[0].iov_base = rqst->rq_buffer;
+ buf->head[0].iov_len = 0;
+ buf->tail[0].iov_base = NULL;
+ buf->tail[0].iov_len = 0;
+ buf->page_len = 0;
+ buf->len = 0;
+ buf->buflen = size;
+
+ return 0;
+
+out_fail:
+ rpcrdma_bc_free_rqst(r_xprt, rqst);
+ return -ENOMEM;
+}
+
+/* Allocate and add receive buffers to the rpcrdma_buffer's
+ * existing list of rep's. These are released when the
+ * transport is destroyed.
+ */
+static int rpcrdma_bc_setup_reps(struct rpcrdma_xprt *r_xprt,
+ unsigned int count)
+{
+ struct rpcrdma_rep *rep;
+ int rc = 0;
+
+ while (count--) {
+ rep = rpcrdma_create_rep(r_xprt);
+ if (IS_ERR(rep)) {
+ pr_err("RPC: %s: reply buffer alloc failed\n",
+ __func__);
+ rc = PTR_ERR(rep);
+ break;
+ }
+
+ rpcrdma_recv_buffer_put(rep);
+ }
+
+ return rc;
+}
+
+/**
+ * xprt_rdma_bc_setup - Pre-allocate resources for handling backchannel requests
+ * @xprt: transport associated with these backchannel resources
+ * @reqs: number of concurrent incoming requests to expect
+ *
+ * Returns 0 on success; otherwise a negative errno
+ */
+int xprt_rdma_bc_setup(struct rpc_xprt *xprt, unsigned int reqs)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
+ struct rpc_rqst *rqst;
+ unsigned int i;
+ int rc;
+
+ /* The backchannel reply path returns each rpc_rqst to the
+ * bc_pa_list _after_ the reply is sent. If the server is
+ * faster than the client, it can send another backward
+ * direction request before the rpc_rqst is returned to the
+ * list. The client rejects the request in this case.
+ *
+ * Twice as many rpc_rqsts are prepared to ensure there is
+ * always an rpc_rqst available as soon as a reply is sent.
+ */
+ if (reqs > RPCRDMA_BACKWARD_WRS >> 1)
+ goto out_err;
+
+ for (i = 0; i < (reqs << 1); i++) {
+ rqst = kzalloc(sizeof(*rqst), GFP_KERNEL);
+ if (!rqst) {
+ pr_err("RPC: %s: Failed to create bc rpc_rqst\n",
+ __func__);
+ goto out_free;
+ }
+ dprintk("RPC: %s: new rqst %p\n", __func__, rqst);
+
+ rqst->rq_xprt = &r_xprt->rx_xprt;
+ INIT_LIST_HEAD(&rqst->rq_list);
+ INIT_LIST_HEAD(&rqst->rq_bc_list);
+
+ if (rpcrdma_bc_setup_rqst(r_xprt, rqst))
+ goto out_free;
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+ }
+
+ rc = rpcrdma_bc_setup_reps(r_xprt, reqs);
+ if (rc)
+ goto out_free;
+
+ rc = rpcrdma_ep_post_extra_recv(r_xprt, reqs);
+ if (rc)
+ goto out_free;
+
+ buffer->rb_bc_srv_max_requests = reqs;
+ request_module("svcrdma");
+
+ return 0;
+
+out_free:
+ xprt_rdma_bc_destroy(xprt, reqs);
+
+out_err:
+ pr_err("RPC: %s: setup backchannel transport failed\n", __func__);
+ return -ENOMEM;
+}
+
+/**
+ * xprt_rdma_bc_up - Create transport endpoint for backchannel service
+ * @serv: server endpoint
+ * @net: network namespace
+ *
+ * The "xprt" is an implied argument: it supplies the name of the
+ * backchannel transport class.
+ *
+ * Returns zero on success, negative errno on failure
+ */
+int xprt_rdma_bc_up(struct svc_serv *serv, struct net *net)
+{
+ int ret;
+
+ ret = svc_create_xprt(serv, "rdma-bc", net, PF_INET, 0, 0);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+/**
+ * rpcrdma_bc_marshal_reply - Send backwards direction reply
+ * @rqst: buffer containing RPC reply data
+ *
+ * Returns zero on success.
+ */
+int rpcrdma_bc_marshal_reply(struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_req *req = rpcr_to_rdmar(rqst);
+ struct rpcrdma_msg *headerp;
+ size_t rpclen;
+
+ headerp = rdmab_to_msg(req->rl_rdmabuf);
+ headerp->rm_xid = rqst->rq_xid;
+ headerp->rm_vers = rpcrdma_version;
+ headerp->rm_credit =
+ cpu_to_be32(r_xprt->rx_buf.rb_bc_srv_max_requests);
+ headerp->rm_type = rdma_msg;
+ headerp->rm_body.rm_chunks[0] = xdr_zero;
+ headerp->rm_body.rm_chunks[1] = xdr_zero;
+ headerp->rm_body.rm_chunks[2] = xdr_zero;
+
+ rpclen = rqst->rq_svec[0].iov_len;
+
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+ pr_info("RPC: %s: rpclen %zd headerp 0x%p lkey 0x%x\n",
+ __func__, rpclen, headerp, rdmab_lkey(req->rl_rdmabuf));
+ pr_info("RPC: %s: RPC/RDMA: %*ph\n",
+ __func__, (int)RPCRDMA_HDRLEN_MIN, headerp);
+ pr_info("RPC: %s: RPC: %*ph\n",
+ __func__, (int)rpclen, rqst->rq_svec[0].iov_base);
+#endif
+
+ req->rl_send_iov[0].addr = rdmab_addr(req->rl_rdmabuf);
+ req->rl_send_iov[0].length = RPCRDMA_HDRLEN_MIN;
+ req->rl_send_iov[0].lkey = rdmab_lkey(req->rl_rdmabuf);
+
+ req->rl_send_iov[1].addr = rdmab_addr(req->rl_sendbuf);
+ req->rl_send_iov[1].length = rpclen;
+ req->rl_send_iov[1].lkey = rdmab_lkey(req->rl_sendbuf);
+
+ req->rl_niovs = 2;
+ return 0;
+}
+
+/**
+ * xprt_rdma_bc_destroy - Release resources for handling backchannel requests
+ * @xprt: transport associated with these backchannel resources
+ * @reqs: number of incoming requests to destroy; ignored
+ */
+void xprt_rdma_bc_destroy(struct rpc_xprt *xprt, unsigned int reqs)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpc_rqst *rqst, *tmp;
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_for_each_entry_safe(rqst, tmp, &xprt->bc_pa_list, rq_bc_pa_list) {
+ list_del(&rqst->rq_bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+
+ rpcrdma_bc_free_rqst(r_xprt, rqst);
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ }
+ spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+/**
+ * xprt_rdma_bc_free_rqst - Release a backchannel rqst
+ * @rqst: request to release
+ */
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+
+ dprintk("RPC: %s: freeing rqst %p (req %p)\n",
+ __func__, rqst, rpcr_to_rdmar(rqst));
+
+ smp_mb__before_atomic();
+ WARN_ON_ONCE(!test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state));
+ clear_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+ smp_mb__after_atomic();
+
+ spin_lock_bh(&xprt->bc_pa_lock);
+ list_add_tail(&rqst->rq_bc_pa_list, &xprt->bc_pa_list);
+ spin_unlock_bh(&xprt->bc_pa_lock);
+}
+
+/**
+ * rpcrdma_bc_receive_call - Handle a backward direction call
+ * @xprt: transport receiving the call
+ * @rep: receive buffer containing the call
+ *
+ * Called in the RPC reply handler, which runs in a tasklet.
+ * Be quick about it.
+ *
+ * Operational assumptions:
+ * o Backchannel credits are ignored, just as the NFS server
+ * forechannel currently does
+ * o The ULP manages a replay cache (eg, NFSv4.1 sessions).
+ * No replay detection is done at the transport level
+ */
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *r_xprt,
+ struct rpcrdma_rep *rep)
+{
+ struct rpc_xprt *xprt = &r_xprt->rx_xprt;
+ struct rpcrdma_msg *headerp;
+ struct svc_serv *bc_serv;
+ struct rpcrdma_req *req;
+ struct rpc_rqst *rqst;
+ struct xdr_buf *buf;
+ size_t size;
+ __be32 *p;
+
+ headerp = rdmab_to_msg(rep->rr_rdmabuf);
+#ifdef RPCRDMA_BACKCHANNEL_DEBUG
+ pr_info("RPC: %s: callback XID %08x, length=%u\n",
+ __func__, be32_to_cpu(headerp->rm_xid), rep->rr_len);
+ pr_info("RPC: %s: %*ph\n", __func__, rep->rr_len, headerp);
+#endif
+
+ /* Sanity check:
+ * Need at least enough bytes for RPC/RDMA header, as code
+ * here references the header fields by array offset. Also,
+ * backward calls are always inline, so ensure there
+ * are some bytes beyond the RPC/RDMA header.
+ */
+ if (rep->rr_len < RPCRDMA_HDRLEN_MIN + 24)
+ goto out_short;
+ p = (__be32 *)((unsigned char *)headerp + RPCRDMA_HDRLEN_MIN);
+ size = rep->rr_len - RPCRDMA_HDRLEN_MIN;
+
+ /* Grab a free bc rqst */
+ spin_lock(&xprt->bc_pa_lock);
+ if (list_empty(&xprt->bc_pa_list)) {
+ spin_unlock(&xprt->bc_pa_lock);
+ goto out_overflow;
+ }
+ rqst = list_first_entry(&xprt->bc_pa_list,
+ struct rpc_rqst, rq_bc_pa_list);
+ list_del(&rqst->rq_bc_pa_list);
+ spin_unlock(&xprt->bc_pa_lock);
+ dprintk("RPC: %s: using rqst %p\n", __func__, rqst);
+
+ /* Prepare rqst */
+ rqst->rq_reply_bytes_recvd = 0;
+ rqst->rq_bytes_sent = 0;
+ rqst->rq_xid = headerp->rm_xid;
+
+ rqst->rq_private_buf.len = size;
+ set_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state);
+
+ buf = &rqst->rq_rcv_buf;
+ memset(buf, 0, sizeof(*buf));
+ buf->head[0].iov_base = p;
+ buf->head[0].iov_len = size;
+ buf->len = size;
+
+ /* The receive buffer has to be hooked to the rpcrdma_req
+ * so that it can be reposted after the server is done
+ * parsing it but just before sending the backward
+ * direction reply.
+ */
+ req = rpcr_to_rdmar(rqst);
+ dprintk("RPC: %s: attaching rep %p to req %p\n",
+ __func__, rep, req);
+ req->rl_reply = rep;
+
+ /* Defeat the retransmit detection logic in send_request */
+ req->rl_connect_cookie = 0;
+
+ /* Queue rqst for ULP's callback service */
+ bc_serv = xprt->bc_serv;
+ spin_lock(&bc_serv->sv_cb_lock);
+ list_add(&rqst->rq_bc_list, &bc_serv->sv_cb_list);
+ spin_unlock(&bc_serv->sv_cb_lock);
+
+ wake_up(&bc_serv->sv_cb_waitq);
+
+ r_xprt->rx_stats.bcall_count++;
+ return;
+
+out_overflow:
+ pr_warn("RPC/RDMA backchannel overflow\n");
+ xprt_disconnect_done(xprt);
+ /* This receive buffer gets reposted automatically
+ * when the connection is re-established.
+ */
+ return;
+
+out_short:
+ pr_warn("RPC/RDMA short backward direction call\n");
+
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ xprt_disconnect_done(xprt);
+ else
+ pr_warn("RPC: %s: reposting rep %p\n",
+ __func__, rep);
+}
diff --git a/net/sunrpc/xprtrdma/fmr_ops.c b/net/sunrpc/xprtrdma/fmr_ops.c
index f1e8dafbd507..b289e106540b 100644
--- a/net/sunrpc/xprtrdma/fmr_ops.c
+++ b/net/sunrpc/xprtrdma/fmr_ops.c
@@ -80,13 +80,13 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
if (!r)
goto out;
- r->r.fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
- sizeof(u64), GFP_KERNEL);
- if (!r->r.fmr.physaddrs)
+ r->fmr.physaddrs = kmalloc(RPCRDMA_MAX_FMR_SGES *
+ sizeof(u64), GFP_KERNEL);
+ if (!r->fmr.physaddrs)
goto out_free;
- r->r.fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
- if (IS_ERR(r->r.fmr.fmr))
+ r->fmr.fmr = ib_alloc_fmr(pd, mr_access_flags, &fmr_attr);
+ if (IS_ERR(r->fmr.fmr))
goto out_fmr_err;
list_add(&r->mw_list, &buf->rb_mws);
@@ -95,9 +95,9 @@ fmr_op_init(struct rpcrdma_xprt *r_xprt)
return 0;
out_fmr_err:
- rc = PTR_ERR(r->r.fmr.fmr);
+ rc = PTR_ERR(r->fmr.fmr);
dprintk("RPC: %s: ib_alloc_fmr status %i\n", __func__, rc);
- kfree(r->r.fmr.physaddrs);
+ kfree(r->fmr.physaddrs);
out_free:
kfree(r);
out:
@@ -109,7 +109,7 @@ __fmr_unmap(struct rpcrdma_mw *r)
{
LIST_HEAD(l);
- list_add(&r->r.fmr.fmr->list, &l);
+ list_add(&r->fmr.fmr->list, &l);
return ib_unmap_fmr(&l);
}
@@ -148,7 +148,7 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
nsegs = RPCRDMA_MAX_FMR_SGES;
for (i = 0; i < nsegs;) {
rpcrdma_map_one(device, seg, direction);
- mw->r.fmr.physaddrs[i] = seg->mr_dma;
+ mw->fmr.physaddrs[i] = seg->mr_dma;
len += seg->mr_len;
++seg;
++i;
@@ -158,13 +158,13 @@ fmr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
break;
}
- rc = ib_map_phys_fmr(mw->r.fmr.fmr, mw->r.fmr.physaddrs,
+ rc = ib_map_phys_fmr(mw->fmr.fmr, mw->fmr.physaddrs,
i, seg1->mr_dma);
if (rc)
goto out_maperr;
seg1->rl_mw = mw;
- seg1->mr_rkey = mw->r.fmr.fmr->rkey;
+ seg1->mr_rkey = mw->fmr.fmr->rkey;
seg1->mr_base = seg1->mr_dma + pageoff;
seg1->mr_nsegs = i;
seg1->mr_len = len;
@@ -179,6 +179,69 @@ out_maperr:
return rc;
}
+static void
+__fmr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
+{
+ struct ib_device *device = r_xprt->rx_ia.ri_device;
+ struct rpcrdma_mw *mw = seg->rl_mw;
+ int nsegs = seg->mr_nsegs;
+
+ seg->rl_mw = NULL;
+
+ while (nsegs--)
+ rpcrdma_unmap_one(device, seg++);
+
+ rpcrdma_put_mw(r_xprt, mw);
+}
+
+/* Invalidate all memory regions that were registered for "req".
+ *
+ * Sleeps until it is safe for the host CPU to access the
+ * previously mapped memory regions.
+ */
+static void
+fmr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct rpcrdma_mr_seg *seg;
+ unsigned int i, nchunks;
+ struct rpcrdma_mw *mw;
+ LIST_HEAD(unmap_list);
+ int rc;
+
+ dprintk("RPC: %s: req %p\n", __func__, req);
+
+ /* ORDER: Invalidate all of the req's MRs first
+ *
+ * ib_unmap_fmr() is slow, so use a single call instead
+ * of one call per mapped MR.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+ mw = seg->rl_mw;
+
+ list_add(&mw->fmr.fmr->list, &unmap_list);
+
+ i += seg->mr_nsegs;
+ }
+ rc = ib_unmap_fmr(&unmap_list);
+ if (rc)
+ pr_warn("%s: ib_unmap_fmr failed (%i)\n", __func__, rc);
+
+ /* ORDER: Now DMA unmap all of the req's MRs, and return
+ * them to the free MW list.
+ */
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+
+ __fmr_dma_unmap(r_xprt, seg);
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ }
+
+ req->rl_nchunks = 0;
+}
+
/* Use the ib_unmap_fmr() verb to prevent further remote
* access via RDMA READ or RDMA WRITE.
*/
@@ -218,9 +281,9 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
while (!list_empty(&buf->rb_all)) {
r = list_entry(buf->rb_all.next, struct rpcrdma_mw, mw_all);
list_del(&r->mw_all);
- kfree(r->r.fmr.physaddrs);
+ kfree(r->fmr.physaddrs);
- rc = ib_dealloc_fmr(r->r.fmr.fmr);
+ rc = ib_dealloc_fmr(r->fmr.fmr);
if (rc)
dprintk("RPC: %s: ib_dealloc_fmr failed %i\n",
__func__, rc);
@@ -231,6 +294,7 @@ fmr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_fmr_memreg_ops = {
.ro_map = fmr_op_map,
+ .ro_unmap_sync = fmr_op_unmap_sync,
.ro_unmap = fmr_op_unmap,
.ro_open = fmr_op_open,
.ro_maxpages = fmr_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/frwr_ops.c b/net/sunrpc/xprtrdma/frwr_ops.c
index 5318951b3b53..c250924a9fd3 100644
--- a/net/sunrpc/xprtrdma/frwr_ops.c
+++ b/net/sunrpc/xprtrdma/frwr_ops.c
@@ -109,20 +109,20 @@ static void
__frwr_recovery_worker(struct work_struct *work)
{
struct rpcrdma_mw *r = container_of(work, struct rpcrdma_mw,
- r.frmr.fr_work);
- struct rpcrdma_xprt *r_xprt = r->r.frmr.fr_xprt;
+ frmr.fr_work);
+ struct rpcrdma_xprt *r_xprt = r->frmr.fr_xprt;
unsigned int depth = r_xprt->rx_ia.ri_max_frmr_depth;
struct ib_pd *pd = r_xprt->rx_ia.ri_pd;
- if (ib_dereg_mr(r->r.frmr.fr_mr))
+ if (ib_dereg_mr(r->frmr.fr_mr))
goto out_fail;
- r->r.frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
- if (IS_ERR(r->r.frmr.fr_mr))
+ r->frmr.fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
+ if (IS_ERR(r->frmr.fr_mr))
goto out_fail;
dprintk("RPC: %s: recovered FRMR %p\n", __func__, r);
- r->r.frmr.fr_state = FRMR_IS_INVALID;
+ r->frmr.fr_state = FRMR_IS_INVALID;
rpcrdma_put_mw(r_xprt, r);
return;
@@ -137,23 +137,29 @@ out_fail:
static void
__frwr_queue_recovery(struct rpcrdma_mw *r)
{
- INIT_WORK(&r->r.frmr.fr_work, __frwr_recovery_worker);
- queue_work(frwr_recovery_wq, &r->r.frmr.fr_work);
+ INIT_WORK(&r->frmr.fr_work, __frwr_recovery_worker);
+ queue_work(frwr_recovery_wq, &r->frmr.fr_work);
}
static int
__frwr_init(struct rpcrdma_mw *r, struct ib_pd *pd, struct ib_device *device,
unsigned int depth)
{
- struct rpcrdma_frmr *f = &r->r.frmr;
+ struct rpcrdma_frmr *f = &r->frmr;
int rc;
f->fr_mr = ib_alloc_mr(pd, IB_MR_TYPE_MEM_REG, depth);
if (IS_ERR(f->fr_mr))
goto out_mr_err;
- f->fr_pgl = ib_alloc_fast_reg_page_list(device, depth);
- if (IS_ERR(f->fr_pgl))
+
+ f->sg = kcalloc(depth, sizeof(*f->sg), GFP_KERNEL);
+ if (!f->sg)
goto out_list_err;
+
+ sg_init_table(f->sg, depth);
+
+ init_completion(&f->fr_linv_done);
+
return 0;
out_mr_err:
@@ -163,9 +169,9 @@ out_mr_err:
return rc;
out_list_err:
- rc = PTR_ERR(f->fr_pgl);
- dprintk("RPC: %s: ib_alloc_fast_reg_page_list status %i\n",
- __func__, rc);
+ rc = -ENOMEM;
+ dprintk("RPC: %s: sg allocation failure\n",
+ __func__);
ib_dereg_mr(f->fr_mr);
return rc;
}
@@ -175,23 +181,22 @@ __frwr_release(struct rpcrdma_mw *r)
{
int rc;
- rc = ib_dereg_mr(r->r.frmr.fr_mr);
+ rc = ib_dereg_mr(r->frmr.fr_mr);
if (rc)
dprintk("RPC: %s: ib_dereg_mr status %i\n",
__func__, rc);
- ib_free_fast_reg_page_list(r->r.frmr.fr_pgl);
+ kfree(r->frmr.sg);
}
static int
frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
struct rpcrdma_create_data_internal *cdata)
{
- struct ib_device_attr *devattr = &ia->ri_devattr;
int depth, delta;
ia->ri_max_frmr_depth =
min_t(unsigned int, RPCRDMA_MAX_DATA_SEGS,
- devattr->max_fast_reg_page_list_len);
+ ia->ri_device->attrs.max_fast_reg_page_list_len);
dprintk("RPC: %s: device's max FR page list len = %u\n",
__func__, ia->ri_max_frmr_depth);
@@ -218,8 +223,8 @@ frwr_op_open(struct rpcrdma_ia *ia, struct rpcrdma_ep *ep,
}
ep->rep_attr.cap.max_send_wr *= depth;
- if (ep->rep_attr.cap.max_send_wr > devattr->max_qp_wr) {
- cdata->max_requests = devattr->max_qp_wr / depth;
+ if (ep->rep_attr.cap.max_send_wr > ia->ri_device->attrs.max_qp_wr) {
+ cdata->max_requests = ia->ri_device->attrs.max_qp_wr / depth;
if (!cdata->max_requests)
return -EINVAL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests *
@@ -241,20 +246,76 @@ frwr_op_maxpages(struct rpcrdma_xprt *r_xprt)
rpcrdma_max_segments(r_xprt) * ia->ri_max_frmr_depth);
}
-/* If FAST_REG or LOCAL_INV failed, indicate the frmr needs to be reset. */
static void
-frwr_sendcompletion(struct ib_wc *wc)
+__frwr_sendcompletion_flush(struct ib_wc *wc, struct rpcrdma_frmr *frmr,
+ const char *wr)
{
- struct rpcrdma_mw *r;
+ frmr->fr_state = FRMR_IS_STALE;
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("rpcrdma: %s: %s (%u/0x%x)\n",
+ wr, ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
+}
- if (likely(wc->status == IB_WC_SUCCESS))
- return;
+/**
+ * frwr_wc_fastreg - Invoked by RDMA provider for each polled FastReg WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
+static void
+frwr_wc_fastreg(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rpcrdma_frmr *frmr;
+ struct ib_cqe *cqe;
- /* WARNING: Only wr_id and status are reliable at this point */
- r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
- pr_warn("RPC: %s: frmr %p flushed, status %s (%d)\n",
- __func__, r, ib_wc_status_msg(wc->status), wc->status);
- r->r.frmr.fr_state = FRMR_IS_STALE;
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ if (wc->status != IB_WC_SUCCESS) {
+ cqe = wc->wr_cqe;
+ frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
+ __frwr_sendcompletion_flush(wc, frmr, "fastreg");
+ }
+}
+
+/**
+ * frwr_wc_localinv - Invoked by RDMA provider for each polled LocalInv WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
+static void
+frwr_wc_localinv(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rpcrdma_frmr *frmr;
+ struct ib_cqe *cqe;
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ if (wc->status != IB_WC_SUCCESS) {
+ cqe = wc->wr_cqe;
+ frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
+ __frwr_sendcompletion_flush(wc, frmr, "localinv");
+ }
+}
+
+/**
+ * frwr_wc_localinv - Invoked by RDMA provider for each polled LocalInv WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ * Awaken anyone waiting for an MR to finish being fenced.
+ */
+static void
+frwr_wc_localinv_wake(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct rpcrdma_frmr *frmr;
+ struct ib_cqe *cqe;
+
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ cqe = wc->wr_cqe;
+ frmr = container_of(cqe, struct rpcrdma_frmr, fr_cqe);
+ if (wc->status != IB_WC_SUCCESS)
+ __frwr_sendcompletion_flush(wc, frmr, "localinv");
+ complete_all(&frmr->fr_linv_done);
}
static int
@@ -291,8 +352,7 @@ frwr_op_init(struct rpcrdma_xprt *r_xprt)
list_add(&r->mw_list, &buf->rb_mws);
list_add(&r->mw_all, &buf->rb_all);
- r->mw_sendcompletion = frwr_sendcompletion;
- r->r.frmr.fr_xprt = r_xprt;
+ r->frmr.fr_xprt = r_xprt;
}
return 0;
@@ -312,13 +372,10 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
struct rpcrdma_mw *mw;
struct rpcrdma_frmr *frmr;
struct ib_mr *mr;
- struct ib_send_wr fastreg_wr, *bad_wr;
+ struct ib_reg_wr *reg_wr;
+ struct ib_send_wr *bad_wr;
+ int rc, i, n, dma_nents;
u8 key;
- int len, pageoff;
- int i, rc;
- int seg_len;
- u64 pa;
- int page_no;
mw = seg1->rl_mw;
seg1->rl_mw = NULL;
@@ -328,71 +385,203 @@ frwr_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
mw = rpcrdma_get_mw(r_xprt);
if (!mw)
return -ENOMEM;
- } while (mw->r.frmr.fr_state != FRMR_IS_INVALID);
- frmr = &mw->r.frmr;
+ } while (mw->frmr.fr_state != FRMR_IS_INVALID);
+ frmr = &mw->frmr;
frmr->fr_state = FRMR_IS_VALID;
+ mr = frmr->fr_mr;
+ reg_wr = &frmr->fr_regwr;
- pageoff = offset_in_page(seg1->mr_offset);
- seg1->mr_offset -= pageoff; /* start of page */
- seg1->mr_len += pageoff;
- len = -pageoff;
if (nsegs > ia->ri_max_frmr_depth)
nsegs = ia->ri_max_frmr_depth;
- for (page_no = i = 0; i < nsegs;) {
- rpcrdma_map_one(device, seg, direction);
- pa = seg->mr_dma;
- for (seg_len = seg->mr_len; seg_len > 0; seg_len -= PAGE_SIZE) {
- frmr->fr_pgl->page_list[page_no++] = pa;
- pa += PAGE_SIZE;
- }
- len += seg->mr_len;
+ for (i = 0; i < nsegs;) {
+ if (seg->mr_page)
+ sg_set_page(&frmr->sg[i],
+ seg->mr_page,
+ seg->mr_len,
+ offset_in_page(seg->mr_offset));
+ else
+ sg_set_buf(&frmr->sg[i], seg->mr_offset,
+ seg->mr_len);
+
++seg;
++i;
+
/* Check for holes */
if ((i < nsegs && offset_in_page(seg->mr_offset)) ||
offset_in_page((seg-1)->mr_offset + (seg-1)->mr_len))
break;
}
- dprintk("RPC: %s: Using frmr %p to map %d segments (%d bytes)\n",
- __func__, mw, i, len);
-
- memset(&fastreg_wr, 0, sizeof(fastreg_wr));
- fastreg_wr.wr_id = (unsigned long)(void *)mw;
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.wr.fast_reg.iova_start = seg1->mr_dma + pageoff;
- fastreg_wr.wr.fast_reg.page_list = frmr->fr_pgl;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.page_list_len = page_no;
- fastreg_wr.wr.fast_reg.length = len;
- fastreg_wr.wr.fast_reg.access_flags = writing ?
- IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
- IB_ACCESS_REMOTE_READ;
- mr = frmr->fr_mr;
+ frmr->sg_nents = i;
+
+ dma_nents = ib_dma_map_sg(device, frmr->sg, frmr->sg_nents, direction);
+ if (!dma_nents) {
+ pr_err("RPC: %s: failed to dma map sg %p sg_nents %u\n",
+ __func__, frmr->sg, frmr->sg_nents);
+ return -ENOMEM;
+ }
+
+ n = ib_map_mr_sg(mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+ if (unlikely(n != frmr->sg_nents)) {
+ pr_err("RPC: %s: failed to map mr %p (%u/%u)\n",
+ __func__, frmr->fr_mr, n, frmr->sg_nents);
+ rc = n < 0 ? n : -EINVAL;
+ goto out_senderr;
+ }
+
+ dprintk("RPC: %s: Using frmr %p to map %u segments (%u bytes)\n",
+ __func__, mw, frmr->sg_nents, mr->length);
+
key = (u8)(mr->rkey & 0x000000FF);
ib_update_fast_reg_key(mr, ++key);
- fastreg_wr.wr.fast_reg.rkey = mr->rkey;
+
+ reg_wr->wr.next = NULL;
+ reg_wr->wr.opcode = IB_WR_REG_MR;
+ frmr->fr_cqe.done = frwr_wc_fastreg;
+ reg_wr->wr.wr_cqe = &frmr->fr_cqe;
+ reg_wr->wr.num_sge = 0;
+ reg_wr->wr.send_flags = 0;
+ reg_wr->mr = mr;
+ reg_wr->key = mr->rkey;
+ reg_wr->access = writing ?
+ IB_ACCESS_REMOTE_WRITE | IB_ACCESS_LOCAL_WRITE :
+ IB_ACCESS_REMOTE_READ;
DECR_CQCOUNT(&r_xprt->rx_ep);
- rc = ib_post_send(ia->ri_id->qp, &fastreg_wr, &bad_wr);
+ rc = ib_post_send(ia->ri_id->qp, &reg_wr->wr, &bad_wr);
if (rc)
goto out_senderr;
+ seg1->mr_dir = direction;
seg1->rl_mw = mw;
seg1->mr_rkey = mr->rkey;
- seg1->mr_base = seg1->mr_dma + pageoff;
- seg1->mr_nsegs = i;
- seg1->mr_len = len;
- return i;
+ seg1->mr_base = mr->iova;
+ seg1->mr_nsegs = frmr->sg_nents;
+ seg1->mr_len = mr->length;
+
+ return frmr->sg_nents;
out_senderr:
dprintk("RPC: %s: ib_post_send status %i\n", __func__, rc);
- while (i--)
- rpcrdma_unmap_one(device, --seg);
+ ib_dma_unmap_sg(device, frmr->sg, dma_nents, direction);
__frwr_queue_recovery(mw);
return rc;
}
+static struct ib_send_wr *
+__frwr_prepare_linv_wr(struct rpcrdma_mr_seg *seg)
+{
+ struct rpcrdma_mw *mw = seg->rl_mw;
+ struct rpcrdma_frmr *f = &mw->frmr;
+ struct ib_send_wr *invalidate_wr;
+
+ f->fr_state = FRMR_IS_INVALID;
+ invalidate_wr = &f->fr_invwr;
+
+ memset(invalidate_wr, 0, sizeof(*invalidate_wr));
+ f->fr_cqe.done = frwr_wc_localinv;
+ invalidate_wr->wr_cqe = &f->fr_cqe;
+ invalidate_wr->opcode = IB_WR_LOCAL_INV;
+ invalidate_wr->ex.invalidate_rkey = f->fr_mr->rkey;
+
+ return invalidate_wr;
+}
+
+static void
+__frwr_dma_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
+ int rc)
+{
+ struct ib_device *device = r_xprt->rx_ia.ri_device;
+ struct rpcrdma_mw *mw = seg->rl_mw;
+ struct rpcrdma_frmr *f = &mw->frmr;
+
+ seg->rl_mw = NULL;
+
+ ib_dma_unmap_sg(device, f->sg, f->sg_nents, seg->mr_dir);
+
+ if (!rc)
+ rpcrdma_put_mw(r_xprt, mw);
+ else
+ __frwr_queue_recovery(mw);
+}
+
+/* Invalidate all memory regions that were registered for "req".
+ *
+ * Sleeps until it is safe for the host CPU to access the
+ * previously mapped memory regions.
+ */
+static void
+frwr_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct ib_send_wr *invalidate_wrs, *pos, *prev, *bad_wr;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_mr_seg *seg;
+ unsigned int i, nchunks;
+ struct rpcrdma_frmr *f;
+ int rc;
+
+ dprintk("RPC: %s: req %p\n", __func__, req);
+
+ /* ORDER: Invalidate all of the req's MRs first
+ *
+ * Chain the LOCAL_INV Work Requests and post them with
+ * a single ib_post_send() call.
+ */
+ invalidate_wrs = pos = prev = NULL;
+ seg = NULL;
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+
+ pos = __frwr_prepare_linv_wr(seg);
+
+ if (!invalidate_wrs)
+ invalidate_wrs = pos;
+ else
+ prev->next = pos;
+ prev = pos;
+
+ i += seg->mr_nsegs;
+ }
+ f = &seg->rl_mw->frmr;
+
+ /* Strong send queue ordering guarantees that when the
+ * last WR in the chain completes, all WRs in the chain
+ * are complete.
+ */
+ f->fr_invwr.send_flags = IB_SEND_SIGNALED;
+ f->fr_cqe.done = frwr_wc_localinv_wake;
+ reinit_completion(&f->fr_linv_done);
+ INIT_CQCOUNT(&r_xprt->rx_ep);
+
+ /* Transport disconnect drains the receive CQ before it
+ * replaces the QP. The RPC reply handler won't call us
+ * unless ri_id->qp is a valid pointer.
+ */
+ rc = ib_post_send(ia->ri_id->qp, invalidate_wrs, &bad_wr);
+ if (rc) {
+ pr_warn("%s: ib_post_send failed %i\n", __func__, rc);
+ rdma_disconnect(ia->ri_id);
+ goto unmap;
+ }
+
+ wait_for_completion(&f->fr_linv_done);
+
+ /* ORDER: Now DMA unmap all of the req's MRs, and return
+ * them to the free MW list.
+ */
+unmap:
+ for (i = 0, nchunks = req->rl_nchunks; nchunks; nchunks--) {
+ seg = &req->rl_segments[i];
+
+ __frwr_dma_unmap(r_xprt, seg, rc);
+
+ i += seg->mr_nsegs;
+ seg->mr_nsegs = 0;
+ }
+
+ req->rl_nchunks = 0;
+}
+
/* Post a LOCAL_INV Work Request to prevent further remote access
* via RDMA READ or RDMA WRITE.
*/
@@ -402,24 +591,26 @@ frwr_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
struct rpcrdma_mr_seg *seg1 = seg;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
struct rpcrdma_mw *mw = seg1->rl_mw;
- struct ib_send_wr invalidate_wr, *bad_wr;
+ struct rpcrdma_frmr *frmr = &mw->frmr;
+ struct ib_send_wr *invalidate_wr, *bad_wr;
int rc, nsegs = seg->mr_nsegs;
dprintk("RPC: %s: FRMR %p\n", __func__, mw);
seg1->rl_mw = NULL;
- mw->r.frmr.fr_state = FRMR_IS_INVALID;
-
- memset(&invalidate_wr, 0, sizeof(invalidate_wr));
- invalidate_wr.wr_id = (unsigned long)(void *)mw;
- invalidate_wr.opcode = IB_WR_LOCAL_INV;
- invalidate_wr.ex.invalidate_rkey = mw->r.frmr.fr_mr->rkey;
+ frmr->fr_state = FRMR_IS_INVALID;
+ invalidate_wr = &mw->frmr.fr_invwr;
+
+ memset(invalidate_wr, 0, sizeof(*invalidate_wr));
+ frmr->fr_cqe.done = frwr_wc_localinv;
+ invalidate_wr->wr_cqe = &frmr->fr_cqe;
+ invalidate_wr->opcode = IB_WR_LOCAL_INV;
+ invalidate_wr->ex.invalidate_rkey = frmr->fr_mr->rkey;
DECR_CQCOUNT(&r_xprt->rx_ep);
- while (seg1->mr_nsegs--)
- rpcrdma_unmap_one(ia->ri_device, seg++);
+ ib_dma_unmap_sg(ia->ri_device, frmr->sg, frmr->sg_nents, seg1->mr_dir);
read_lock(&ia->ri_qplock);
- rc = ib_post_send(ia->ri_id->qp, &invalidate_wr, &bad_wr);
+ rc = ib_post_send(ia->ri_id->qp, invalidate_wr, &bad_wr);
read_unlock(&ia->ri_qplock);
if (rc)
goto out_err;
@@ -451,6 +642,7 @@ frwr_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_frwr_memreg_ops = {
.ro_map = frwr_op_map,
+ .ro_unmap_sync = frwr_op_unmap_sync,
.ro_unmap = frwr_op_unmap,
.ro_open = frwr_op_open,
.ro_maxpages = frwr_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/physical_ops.c b/net/sunrpc/xprtrdma/physical_ops.c
index 617b76f22154..481b9b6f4a15 100644
--- a/net/sunrpc/xprtrdma/physical_ops.c
+++ b/net/sunrpc/xprtrdma/physical_ops.c
@@ -68,7 +68,6 @@ physical_op_map(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg,
rpcrdma_map_one(ia->ri_device, seg, rpcrdma_data_dir(writing));
seg->mr_rkey = ia->ri_dma_mr->rkey;
seg->mr_base = seg->mr_dma;
- seg->mr_nsegs = 1;
return 1;
}
@@ -83,6 +82,18 @@ physical_op_unmap(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mr_seg *seg)
return 1;
}
+/* DMA unmap all memory regions that were mapped for "req".
+ */
+static void
+physical_op_unmap_sync(struct rpcrdma_xprt *r_xprt, struct rpcrdma_req *req)
+{
+ struct ib_device *device = r_xprt->rx_ia.ri_device;
+ unsigned int i;
+
+ for (i = 0; req->rl_nchunks; --req->rl_nchunks)
+ rpcrdma_unmap_one(device, &req->rl_segments[i++]);
+}
+
static void
physical_op_destroy(struct rpcrdma_buffer *buf)
{
@@ -90,6 +101,7 @@ physical_op_destroy(struct rpcrdma_buffer *buf)
const struct rpcrdma_memreg_ops rpcrdma_physical_memreg_ops = {
.ro_map = physical_op_map,
+ .ro_unmap_sync = physical_op_unmap_sync,
.ro_unmap = physical_op_unmap,
.ro_open = physical_op_open,
.ro_maxpages = physical_op_maxpages,
diff --git a/net/sunrpc/xprtrdma/rpc_rdma.c b/net/sunrpc/xprtrdma/rpc_rdma.c
index bc8bd6577467..888823bb6dae 100644
--- a/net/sunrpc/xprtrdma/rpc_rdma.c
+++ b/net/sunrpc/xprtrdma/rpc_rdma.c
@@ -132,6 +132,33 @@ rpcrdma_tail_pullup(struct xdr_buf *buf)
return tlen;
}
+/* Split "vec" on page boundaries into segments. FMR registers pages,
+ * not a byte range. Other modes coalesce these segments into a single
+ * MR when they can.
+ */
+static int
+rpcrdma_convert_kvec(struct kvec *vec, struct rpcrdma_mr_seg *seg,
+ int n, int nsegs)
+{
+ size_t page_offset;
+ u32 remaining;
+ char *base;
+
+ base = vec->iov_base;
+ page_offset = offset_in_page(base);
+ remaining = vec->iov_len;
+ while (remaining && n < nsegs) {
+ seg[n].mr_page = NULL;
+ seg[n].mr_offset = base;
+ seg[n].mr_len = min_t(u32, PAGE_SIZE - page_offset, remaining);
+ remaining -= seg[n].mr_len;
+ base += seg[n].mr_len;
+ ++n;
+ page_offset = 0;
+ }
+ return n;
+}
+
/*
* Chunk assembly from upper layer xdr_buf.
*
@@ -150,11 +177,10 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
int page_base;
struct page **ppages;
- if (pos == 0 && xdrbuf->head[0].iov_len) {
- seg[n].mr_page = NULL;
- seg[n].mr_offset = xdrbuf->head[0].iov_base;
- seg[n].mr_len = xdrbuf->head[0].iov_len;
- ++n;
+ if (pos == 0) {
+ n = rpcrdma_convert_kvec(&xdrbuf->head[0], seg, n, nsegs);
+ if (n == nsegs)
+ return -EIO;
}
len = xdrbuf->page_len;
@@ -192,13 +218,9 @@ rpcrdma_convert_iovs(struct xdr_buf *xdrbuf, unsigned int pos,
* xdr pad bytes, saving the server an RDMA operation. */
if (xdrbuf->tail[0].iov_len < 4 && xprt_rdma_pad_optimize)
return n;
+ n = rpcrdma_convert_kvec(&xdrbuf->tail[0], seg, n, nsegs);
if (n == nsegs)
- /* Tail remains, but we're out of segments */
return -EIO;
- seg[n].mr_page = NULL;
- seg[n].mr_offset = xdrbuf->tail[0].iov_base;
- seg[n].mr_len = xdrbuf->tail[0].iov_len;
- ++n;
}
return n;
@@ -441,6 +463,11 @@ rpcrdma_marshal_req(struct rpc_rqst *rqst)
enum rpcrdma_chunktype rtype, wtype;
struct rpcrdma_msg *headerp;
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ if (test_bit(RPC_BC_PA_IN_USE, &rqst->rq_bc_pa_state))
+ return rpcrdma_bc_marshal_reply(rqst);
+#endif
+
/*
* rpclen gets amount of data in first buffer, which is the
* pre-registered buffer.
@@ -711,6 +738,37 @@ rpcrdma_connect_worker(struct work_struct *work)
spin_unlock_bh(&xprt->transport_lock);
}
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool
+rpcrdma_is_bcall(struct rpcrdma_msg *headerp)
+{
+ __be32 *p = (__be32 *)headerp;
+
+ if (headerp->rm_type != rdma_msg)
+ return false;
+ if (headerp->rm_body.rm_chunks[0] != xdr_zero)
+ return false;
+ if (headerp->rm_body.rm_chunks[1] != xdr_zero)
+ return false;
+ if (headerp->rm_body.rm_chunks[2] != xdr_zero)
+ return false;
+
+ /* sanity */
+ if (p[7] != headerp->rm_xid)
+ return false;
+ /* call direction */
+ if (p[8] != cpu_to_be32(RPC_CALL))
+ return false;
+
+ return true;
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
/*
* This function is called when an async event is posted to
* the connection which changes the connection state. All it
@@ -723,8 +781,8 @@ rpcrdma_conn_func(struct rpcrdma_ep *ep)
schedule_delayed_work(&ep->rep_connect_worker, 0);
}
-/*
- * Called as a tasklet to do req/reply match and complete a request
+/* Process received RPC/RDMA messages.
+ *
* Errors must result in the RPC task either being awakened, or
* allowed to timeout, to discover the errors at that time.
*/
@@ -737,66 +795,49 @@ rpcrdma_reply_handler(struct rpcrdma_rep *rep)
struct rpcrdma_xprt *r_xprt = rep->rr_rxprt;
struct rpc_xprt *xprt = &r_xprt->rx_xprt;
__be32 *iptr;
- int rdmalen, status;
+ int rdmalen, status, rmerr;
unsigned long cwnd;
- u32 credits;
- /* Check status. If bad, signal disconnect and return rep to pool */
- if (rep->rr_len == ~0U) {
- rpcrdma_recv_buffer_put(rep);
- if (r_xprt->rx_ep.rep_connected == 1) {
- r_xprt->rx_ep.rep_connected = -EIO;
- rpcrdma_conn_func(&r_xprt->rx_ep);
- }
- return;
- }
- if (rep->rr_len < RPCRDMA_HDRLEN_MIN) {
- dprintk("RPC: %s: short/invalid reply\n", __func__);
- goto repost;
- }
+ dprintk("RPC: %s: incoming rep %p\n", __func__, rep);
+
+ if (rep->rr_len == RPCRDMA_BAD_LEN)
+ goto out_badstatus;
+ if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
+ goto out_shortreply;
+
headerp = rdmab_to_msg(rep->rr_rdmabuf);
- if (headerp->rm_vers != rpcrdma_version) {
- dprintk("RPC: %s: invalid version %d\n",
- __func__, be32_to_cpu(headerp->rm_vers));
- goto repost;
- }
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ if (rpcrdma_is_bcall(headerp))
+ goto out_bcall;
+#endif
- /* Get XID and try for a match. */
- spin_lock(&xprt->transport_lock);
+ /* Match incoming rpcrdma_rep to an rpcrdma_req to
+ * get context for handling any incoming chunks.
+ */
+ spin_lock_bh(&xprt->transport_lock);
rqst = xprt_lookup_rqst(xprt, headerp->rm_xid);
- if (rqst == NULL) {
- spin_unlock(&xprt->transport_lock);
- dprintk("RPC: %s: reply 0x%p failed "
- "to match any request xid 0x%08x len %d\n",
- __func__, rep, be32_to_cpu(headerp->rm_xid),
- rep->rr_len);
-repost:
- r_xprt->rx_stats.bad_reply_count++;
- if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
- rpcrdma_recv_buffer_put(rep);
+ if (!rqst)
+ goto out_nomatch;
- return;
- }
-
- /* get request object */
req = rpcr_to_rdmar(rqst);
- if (req->rl_reply) {
- spin_unlock(&xprt->transport_lock);
- dprintk("RPC: %s: duplicate reply 0x%p to RPC "
- "request 0x%p: xid 0x%08x\n", __func__, rep, req,
- be32_to_cpu(headerp->rm_xid));
- goto repost;
- }
+ if (req->rl_reply)
+ goto out_duplicate;
- dprintk("RPC: %s: reply 0x%p completes request 0x%p\n"
- " RPC request 0x%p xid 0x%08x\n",
- __func__, rep, req, rqst,
- be32_to_cpu(headerp->rm_xid));
+ /* Sanity checking has passed. We are now committed
+ * to complete this transaction.
+ */
+ list_del_init(&rqst->rq_list);
+ spin_unlock_bh(&xprt->transport_lock);
+ dprintk("RPC: %s: reply %p completes request %p (xid 0x%08x)\n",
+ __func__, rep, req, be32_to_cpu(headerp->rm_xid));
/* from here on, the reply is no longer an orphan */
req->rl_reply = rep;
xprt->reestablish_timeout = 0;
+ if (headerp->rm_vers != rpcrdma_version)
+ goto out_badversion;
+
/* check for expected message types */
/* The order of some of these tests is important. */
switch (headerp->rm_type) {
@@ -857,6 +898,9 @@ repost:
status = rdmalen;
break;
+ case rdma_error:
+ goto out_rdmaerr;
+
badheader:
default:
dprintk("%s: invalid rpcrdma reply header (type %d):"
@@ -872,19 +916,97 @@ badheader:
break;
}
- credits = be32_to_cpu(headerp->rm_credit);
- if (credits == 0)
- credits = 1; /* don't deadlock */
- else if (credits > r_xprt->rx_buf.rb_max_requests)
- credits = r_xprt->rx_buf.rb_max_requests;
+out:
+ /* Invalidate and flush the data payloads before waking the
+ * waiting application. This guarantees the memory region is
+ * properly fenced from the server before the application
+ * accesses the data. It also ensures proper send flow
+ * control: waking the next RPC waits until this RPC has
+ * relinquished all its Send Queue entries.
+ */
+ if (req->rl_nchunks)
+ r_xprt->rx_ia.ri_ops->ro_unmap_sync(r_xprt, req);
+ spin_lock_bh(&xprt->transport_lock);
cwnd = xprt->cwnd;
- xprt->cwnd = credits << RPC_CWNDSHIFT;
+ xprt->cwnd = atomic_read(&r_xprt->rx_buf.rb_credits) << RPC_CWNDSHIFT;
if (xprt->cwnd > cwnd)
xprt_release_rqst_cong(rqst->rq_task);
+ xprt_complete_rqst(rqst->rq_task, status);
+ spin_unlock_bh(&xprt->transport_lock);
dprintk("RPC: %s: xprt_complete_rqst(0x%p, 0x%p, %d)\n",
__func__, xprt, rqst, status);
- xprt_complete_rqst(rqst->rq_task, status);
- spin_unlock(&xprt->transport_lock);
+ return;
+
+out_badstatus:
+ rpcrdma_recv_buffer_put(rep);
+ if (r_xprt->rx_ep.rep_connected == 1) {
+ r_xprt->rx_ep.rep_connected = -EIO;
+ rpcrdma_conn_func(&r_xprt->rx_ep);
+ }
+ return;
+
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+out_bcall:
+ rpcrdma_bc_receive_call(r_xprt, rep);
+ return;
+#endif
+
+/* If the incoming reply terminated a pending RPC, the next
+ * RPC call will post a replacement receive buffer as it is
+ * being marshaled.
+ */
+out_badversion:
+ dprintk("RPC: %s: invalid version %d\n",
+ __func__, be32_to_cpu(headerp->rm_vers));
+ status = -EIO;
+ r_xprt->rx_stats.bad_reply_count++;
+ goto out;
+
+out_rdmaerr:
+ rmerr = be32_to_cpu(headerp->rm_body.rm_error.rm_err);
+ switch (rmerr) {
+ case ERR_VERS:
+ pr_err("%s: server reports header version error (%u-%u)\n",
+ __func__,
+ be32_to_cpu(headerp->rm_body.rm_error.rm_vers_low),
+ be32_to_cpu(headerp->rm_body.rm_error.rm_vers_high));
+ break;
+ case ERR_CHUNK:
+ pr_err("%s: server reports header decoding error\n",
+ __func__);
+ break;
+ default:
+ pr_err("%s: server reports unknown error %d\n",
+ __func__, rmerr);
+ }
+ status = -EREMOTEIO;
+ r_xprt->rx_stats.bad_reply_count++;
+ goto out;
+
+/* If no pending RPC transaction was matched, post a replacement
+ * receive buffer before returning.
+ */
+out_shortreply:
+ dprintk("RPC: %s: short/invalid reply\n", __func__);
+ goto repost;
+
+out_nomatch:
+ spin_unlock_bh(&xprt->transport_lock);
+ dprintk("RPC: %s: no match for incoming xid 0x%08x len %d\n",
+ __func__, be32_to_cpu(headerp->rm_xid),
+ rep->rr_len);
+ goto repost;
+
+out_duplicate:
+ spin_unlock_bh(&xprt->transport_lock);
+ dprintk("RPC: %s: "
+ "duplicate reply %p to RPC request %p: xid 0x%08x\n",
+ __func__, rep, req, be32_to_cpu(headerp->rm_xid));
+
+repost:
+ r_xprt->rx_stats.bad_reply_count++;
+ if (rpcrdma_ep_post_recv(&r_xprt->rx_ia, &r_xprt->rx_ep, rep))
+ rpcrdma_recv_buffer_put(rep);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma.c b/net/sunrpc/xprtrdma/svc_rdma.c
index 2cd252f023a5..c846ca9f1eba 100644
--- a/net/sunrpc/xprtrdma/svc_rdma.c
+++ b/net/sunrpc/xprtrdma/svc_rdma.c
@@ -55,6 +55,7 @@ unsigned int svcrdma_ord = RPCRDMA_ORD;
static unsigned int min_ord = 1;
static unsigned int max_ord = 4096;
unsigned int svcrdma_max_requests = RPCRDMA_MAX_REQUESTS;
+unsigned int svcrdma_max_bc_requests = RPCRDMA_MAX_BC_REQUESTS;
static unsigned int min_max_requests = 4;
static unsigned int max_max_requests = 16384;
unsigned int svcrdma_max_req_size = RPCRDMA_MAX_REQ_SIZE;
@@ -71,10 +72,6 @@ atomic_t rdma_stat_rq_prod;
atomic_t rdma_stat_sq_poll;
atomic_t rdma_stat_sq_prod;
-/* Temporary NFS request map and context caches */
-struct kmem_cache *svc_rdma_map_cachep;
-struct kmem_cache *svc_rdma_ctxt_cachep;
-
struct workqueue_struct *svc_rdma_wq;
/*
@@ -239,18 +236,20 @@ void svc_rdma_cleanup(void)
unregister_sysctl_table(svcrdma_table_header);
svcrdma_table_header = NULL;
}
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ svc_unreg_xprt_class(&svc_rdma_bc_class);
+#endif
svc_unreg_xprt_class(&svc_rdma_class);
- kmem_cache_destroy(svc_rdma_map_cachep);
- kmem_cache_destroy(svc_rdma_ctxt_cachep);
}
int svc_rdma_init(void)
{
dprintk("SVCRDMA Module Init, register RPC RDMA transport\n");
dprintk("\tsvcrdma_ord : %d\n", svcrdma_ord);
- dprintk("\tmax_requests : %d\n", svcrdma_max_requests);
- dprintk("\tsq_depth : %d\n",
+ dprintk("\tmax_requests : %u\n", svcrdma_max_requests);
+ dprintk("\tsq_depth : %u\n",
svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT);
+ dprintk("\tmax_bc_requests : %u\n", svcrdma_max_bc_requests);
dprintk("\tmax_inline : %d\n", svcrdma_max_req_size);
svc_rdma_wq = alloc_workqueue("svc_rdma", 0, 0);
@@ -261,36 +260,10 @@ int svc_rdma_init(void)
svcrdma_table_header =
register_sysctl_table(svcrdma_root_table);
- /* Create the temporary map cache */
- svc_rdma_map_cachep = kmem_cache_create("svc_rdma_map_cache",
- sizeof(struct svc_rdma_req_map),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!svc_rdma_map_cachep) {
- printk(KERN_INFO "Could not allocate map cache.\n");
- goto err0;
- }
-
- /* Create the temporary context cache */
- svc_rdma_ctxt_cachep =
- kmem_cache_create("svc_rdma_ctxt_cache",
- sizeof(struct svc_rdma_op_ctxt),
- 0,
- SLAB_HWCACHE_ALIGN,
- NULL);
- if (!svc_rdma_ctxt_cachep) {
- printk(KERN_INFO "Could not allocate WR ctxt cache.\n");
- goto err1;
- }
-
/* Register RDMA with the SVC transport switch */
svc_reg_xprt_class(&svc_rdma_class);
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ svc_reg_xprt_class(&svc_rdma_bc_class);
+#endif
return 0;
- err1:
- kmem_cache_destroy(svc_rdma_map_cachep);
- err0:
- unregister_sysctl_table(svcrdma_table_header);
- destroy_workqueue(svc_rdma_wq);
- return -ENOMEM;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_backchannel.c b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
new file mode 100644
index 000000000000..a2a7519b0f23
--- /dev/null
+++ b/net/sunrpc/xprtrdma/svc_rdma_backchannel.c
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2015 Oracle. All rights reserved.
+ *
+ * Support for backward direction RPCs on RPC/RDMA (server-side).
+ */
+
+#include <linux/sunrpc/svc_rdma.h>
+#include "xprt_rdma.h"
+
+#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+
+#undef SVCRDMA_BACKCHANNEL_DEBUG
+
+int svc_rdma_handle_bc_reply(struct rpc_xprt *xprt, struct rpcrdma_msg *rmsgp,
+ struct xdr_buf *rcvbuf)
+{
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct kvec *dst, *src = &rcvbuf->head[0];
+ struct rpc_rqst *req;
+ unsigned long cwnd;
+ u32 credits;
+ size_t len;
+ __be32 xid;
+ __be32 *p;
+ int ret;
+
+ p = (__be32 *)src->iov_base;
+ len = src->iov_len;
+ xid = rmsgp->rm_xid;
+
+#ifdef SVCRDMA_BACKCHANNEL_DEBUG
+ pr_info("%s: xid=%08x, length=%zu\n",
+ __func__, be32_to_cpu(xid), len);
+ pr_info("%s: RPC/RDMA: %*ph\n",
+ __func__, (int)RPCRDMA_HDRLEN_MIN, rmsgp);
+ pr_info("%s: RPC: %*ph\n",
+ __func__, (int)len, p);
+#endif
+
+ ret = -EAGAIN;
+ if (src->iov_len < 24)
+ goto out_shortreply;
+
+ spin_lock_bh(&xprt->transport_lock);
+ req = xprt_lookup_rqst(xprt, xid);
+ if (!req)
+ goto out_notfound;
+
+ dst = &req->rq_private_buf.head[0];
+ memcpy(&req->rq_private_buf, &req->rq_rcv_buf, sizeof(struct xdr_buf));
+ if (dst->iov_len < len)
+ goto out_unlock;
+ memcpy(dst->iov_base, p, len);
+
+ credits = be32_to_cpu(rmsgp->rm_credit);
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > r_xprt->rx_buf.rb_bc_max_requests)
+ credits = r_xprt->rx_buf.rb_bc_max_requests;
+
+ cwnd = xprt->cwnd;
+ xprt->cwnd = credits << RPC_CWNDSHIFT;
+ if (xprt->cwnd > cwnd)
+ xprt_release_rqst_cong(req->rq_task);
+
+ ret = 0;
+ xprt_complete_rqst(req->rq_task, rcvbuf->len);
+ rcvbuf->len = 0;
+
+out_unlock:
+ spin_unlock_bh(&xprt->transport_lock);
+out:
+ return ret;
+
+out_shortreply:
+ dprintk("svcrdma: short bc reply: xprt=%p, len=%zu\n",
+ xprt, src->iov_len);
+ goto out;
+
+out_notfound:
+ dprintk("svcrdma: unrecognized bc reply: xprt=%p, xid=%08x\n",
+ xprt, be32_to_cpu(xid));
+
+ goto out_unlock;
+}
+
+/* Send a backwards direction RPC call.
+ *
+ * Caller holds the connection's mutex and has already marshaled
+ * the RPC/RDMA request.
+ *
+ * This is similar to svc_rdma_reply, but takes an rpc_rqst
+ * instead, does not support chunks, and avoids blocking memory
+ * allocation.
+ *
+ * XXX: There is still an opportunity to block in svc_rdma_send()
+ * if there are no SQ entries to post the Send. This may occur if
+ * the adapter has a small maximum SQ depth.
+ */
+static int svc_rdma_bc_sendto(struct svcxprt_rdma *rdma,
+ struct rpc_rqst *rqst)
+{
+ struct xdr_buf *sndbuf = &rqst->rq_snd_buf;
+ struct svc_rdma_op_ctxt *ctxt;
+ struct svc_rdma_req_map *vec;
+ struct ib_send_wr send_wr;
+ int ret;
+
+ vec = svc_rdma_get_req_map(rdma);
+ ret = svc_rdma_map_xdr(rdma, sndbuf, vec, false);
+ if (ret)
+ goto out_err;
+
+ ret = svc_rdma_repost_recv(rdma, GFP_NOIO);
+ if (ret)
+ goto out_err;
+
+ ctxt = svc_rdma_get_context(rdma);
+ ctxt->pages[0] = virt_to_page(rqst->rq_buffer);
+ ctxt->count = 1;
+
+ ctxt->direction = DMA_TO_DEVICE;
+ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
+ ctxt->sge[0].length = sndbuf->len;
+ ctxt->sge[0].addr =
+ ib_dma_map_page(rdma->sc_cm_id->device, ctxt->pages[0], 0,
+ sndbuf->len, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(rdma->sc_cm_id->device, ctxt->sge[0].addr)) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+ atomic_inc(&rdma->sc_dma_used);
+
+ memset(&send_wr, 0, sizeof(send_wr));
+ ctxt->cqe.done = svc_rdma_wc_send;
+ send_wr.wr_cqe = &ctxt->cqe;
+ send_wr.sg_list = ctxt->sge;
+ send_wr.num_sge = 1;
+ send_wr.opcode = IB_WR_SEND;
+ send_wr.send_flags = IB_SEND_SIGNALED;
+
+ ret = svc_rdma_send(rdma, &send_wr);
+ if (ret) {
+ ret = -EIO;
+ goto out_unmap;
+ }
+
+out_err:
+ svc_rdma_put_req_map(rdma, vec);
+ dprintk("svcrdma: %s returns %d\n", __func__, ret);
+ return ret;
+
+out_unmap:
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ goto out_err;
+}
+
+/* Server-side transport endpoint wants a whole page for its send
+ * buffer. The client RPC code constructs the RPC header in this
+ * buffer before it invokes ->send_request.
+ *
+ * Returns NULL if there was a temporary allocation failure.
+ */
+static void *
+xprt_rdma_bc_allocate(struct rpc_task *task, size_t size)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+ struct svcxprt_rdma *rdma;
+ struct page *page;
+
+ rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+
+ /* Prevent an infinite loop: try to make this case work */
+ if (size > PAGE_SIZE)
+ WARN_ONCE(1, "svcrdma: large bc buffer request (size %zu)\n",
+ size);
+
+ page = alloc_page(RPCRDMA_DEF_GFP);
+ if (!page)
+ return NULL;
+
+ return page_address(page);
+}
+
+static void
+xprt_rdma_bc_free(void *buffer)
+{
+ /* No-op: ctxt and page have already been freed. */
+}
+
+static int
+rpcrdma_bc_send_request(struct svcxprt_rdma *rdma, struct rpc_rqst *rqst)
+{
+ struct rpc_xprt *xprt = rqst->rq_xprt;
+ struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
+ struct rpcrdma_msg *headerp = (struct rpcrdma_msg *)rqst->rq_buffer;
+ int rc;
+
+ /* Space in the send buffer for an RPC/RDMA header is reserved
+ * via xprt->tsh_size.
+ */
+ headerp->rm_xid = rqst->rq_xid;
+ headerp->rm_vers = rpcrdma_version;
+ headerp->rm_credit = cpu_to_be32(r_xprt->rx_buf.rb_bc_max_requests);
+ headerp->rm_type = rdma_msg;
+ headerp->rm_body.rm_chunks[0] = xdr_zero;
+ headerp->rm_body.rm_chunks[1] = xdr_zero;
+ headerp->rm_body.rm_chunks[2] = xdr_zero;
+
+#ifdef SVCRDMA_BACKCHANNEL_DEBUG
+ pr_info("%s: %*ph\n", __func__, 64, rqst->rq_buffer);
+#endif
+
+ rc = svc_rdma_bc_sendto(rdma, rqst);
+ if (rc)
+ goto drop_connection;
+ return rc;
+
+drop_connection:
+ dprintk("svcrdma: failed to send bc call\n");
+ xprt_disconnect_done(xprt);
+ return -ENOTCONN;
+}
+
+/* Send an RPC call on the passive end of a transport
+ * connection.
+ */
+static int
+xprt_rdma_bc_send_request(struct rpc_task *task)
+{
+ struct rpc_rqst *rqst = task->tk_rqstp;
+ struct svc_xprt *sxprt = rqst->rq_xprt->bc_xprt;
+ struct svcxprt_rdma *rdma;
+ int ret;
+
+ dprintk("svcrdma: sending bc call with xid: %08x\n",
+ be32_to_cpu(rqst->rq_xid));
+
+ if (!mutex_trylock(&sxprt->xpt_mutex)) {
+ rpc_sleep_on(&sxprt->xpt_bc_pending, task, NULL);
+ if (!mutex_trylock(&sxprt->xpt_mutex))
+ return -EAGAIN;
+ rpc_wake_up_queued_task(&sxprt->xpt_bc_pending, task);
+ }
+
+ ret = -ENOTCONN;
+ rdma = container_of(sxprt, struct svcxprt_rdma, sc_xprt);
+ if (!test_bit(XPT_DEAD, &sxprt->xpt_flags))
+ ret = rpcrdma_bc_send_request(rdma, rqst);
+
+ mutex_unlock(&sxprt->xpt_mutex);
+
+ if (ret < 0)
+ return ret;
+ return 0;
+}
+
+static void
+xprt_rdma_bc_close(struct rpc_xprt *xprt)
+{
+ dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+}
+
+static void
+xprt_rdma_bc_put(struct rpc_xprt *xprt)
+{
+ dprintk("svcrdma: %s: xprt %p\n", __func__, xprt);
+
+ xprt_free(xprt);
+ module_put(THIS_MODULE);
+}
+
+static struct rpc_xprt_ops xprt_rdma_bc_procs = {
+ .reserve_xprt = xprt_reserve_xprt_cong,
+ .release_xprt = xprt_release_xprt_cong,
+ .alloc_slot = xprt_alloc_slot,
+ .release_request = xprt_release_rqst_cong,
+ .buf_alloc = xprt_rdma_bc_allocate,
+ .buf_free = xprt_rdma_bc_free,
+ .send_request = xprt_rdma_bc_send_request,
+ .set_retrans_timeout = xprt_set_retrans_timeout_def,
+ .close = xprt_rdma_bc_close,
+ .destroy = xprt_rdma_bc_put,
+ .print_stats = xprt_rdma_print_stats
+};
+
+static const struct rpc_timeout xprt_rdma_bc_timeout = {
+ .to_initval = 60 * HZ,
+ .to_maxval = 60 * HZ,
+};
+
+/* It shouldn't matter if the number of backchannel session slots
+ * doesn't match the number of RPC/RDMA credits. That just means
+ * one or the other will have extra slots that aren't used.
+ */
+static struct rpc_xprt *
+xprt_setup_rdma_bc(struct xprt_create *args)
+{
+ struct rpc_xprt *xprt;
+ struct rpcrdma_xprt *new_xprt;
+
+ if (args->addrlen > sizeof(xprt->addr)) {
+ dprintk("RPC: %s: address too large\n", __func__);
+ return ERR_PTR(-EBADF);
+ }
+
+ xprt = xprt_alloc(args->net, sizeof(*new_xprt),
+ RPCRDMA_MAX_BC_REQUESTS,
+ RPCRDMA_MAX_BC_REQUESTS);
+ if (!xprt) {
+ dprintk("RPC: %s: couldn't allocate rpc_xprt\n",
+ __func__);
+ return ERR_PTR(-ENOMEM);
+ }
+
+ xprt->timeout = &xprt_rdma_bc_timeout;
+ xprt_set_bound(xprt);
+ xprt_set_connected(xprt);
+ xprt->bind_timeout = RPCRDMA_BIND_TO;
+ xprt->reestablish_timeout = RPCRDMA_INIT_REEST_TO;
+ xprt->idle_timeout = RPCRDMA_IDLE_DISC_TO;
+
+ xprt->prot = XPRT_TRANSPORT_BC_RDMA;
+ xprt->tsh_size = RPCRDMA_HDRLEN_MIN / sizeof(__be32);
+ xprt->ops = &xprt_rdma_bc_procs;
+
+ memcpy(&xprt->addr, args->dstaddr, args->addrlen);
+ xprt->addrlen = args->addrlen;
+ xprt_rdma_format_addresses(xprt, (struct sockaddr *)&xprt->addr);
+ xprt->resvport = 0;
+
+ xprt->max_payload = xprt_rdma_max_inline_read;
+
+ new_xprt = rpcx_to_rdmax(xprt);
+ new_xprt->rx_buf.rb_bc_max_requests = xprt->max_reqs;
+
+ xprt_get(xprt);
+ args->bc_xprt->xpt_bc_xprt = xprt;
+ xprt->bc_xprt = args->bc_xprt;
+
+ if (!try_module_get(THIS_MODULE))
+ goto out_fail;
+
+ /* Final put for backchannel xprt is in __svc_rdma_free */
+ xprt_get(xprt);
+ return xprt;
+
+out_fail:
+ xprt_rdma_free_addresses(xprt);
+ args->bc_xprt->xpt_bc_xprt = NULL;
+ xprt_put(xprt);
+ xprt_free(xprt);
+ return ERR_PTR(-EINVAL);
+}
+
+struct xprt_class xprt_rdma_bc = {
+ .list = LIST_HEAD_INIT(xprt_rdma_bc.list),
+ .name = "rdma backchannel",
+ .owner = THIS_MODULE,
+ .ident = XPRT_TRANSPORT_BC_RDMA,
+ .setup = xprt_setup_rdma_bc,
+};
diff --git a/net/sunrpc/xprtrdma/svc_rdma_marshal.c b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
index e2fca7617242..765bca47c74d 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_marshal.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_marshal.c
@@ -145,29 +145,44 @@ static __be32 *decode_reply_array(__be32 *va, __be32 *vaend)
return (__be32 *)&ary->wc_array[nchunks];
}
-int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
- struct svc_rqst *rqstp)
+int svc_rdma_xdr_decode_req(struct rpcrdma_msg *rmsgp, struct svc_rqst *rqstp)
{
- struct rpcrdma_msg *rmsgp = NULL;
__be32 *va, *vaend;
+ unsigned int len;
u32 hdr_len;
- rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
-
/* Verify that there's enough bytes for header + something */
- if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_MIN) {
+ if (rqstp->rq_arg.len <= RPCRDMA_HDRLEN_ERR) {
dprintk("svcrdma: header too short = %d\n",
rqstp->rq_arg.len);
return -EINVAL;
}
- if (rmsgp->rm_vers != rpcrdma_version)
- return -ENOSYS;
-
- /* Pull in the extra for the padded case and bump our pointer */
- if (rmsgp->rm_type == rdma_msgp) {
- int hdrlen;
+ if (rmsgp->rm_vers != rpcrdma_version) {
+ dprintk("%s: bad version %u\n", __func__,
+ be32_to_cpu(rmsgp->rm_vers));
+ return -EPROTONOSUPPORT;
+ }
+ switch (be32_to_cpu(rmsgp->rm_type)) {
+ case RDMA_MSG:
+ case RDMA_NOMSG:
+ break;
+
+ case RDMA_DONE:
+ /* Just drop it */
+ dprintk("svcrdma: dropping RDMA_DONE message\n");
+ return 0;
+
+ case RDMA_ERROR:
+ /* Possible if this is a backchannel reply.
+ * XXX: We should cancel this XID, though.
+ */
+ dprintk("svcrdma: dropping RDMA_ERROR message\n");
+ return 0;
+
+ case RDMA_MSGP:
+ /* Pull in the extra for the padded case, bump our pointer */
rmsgp->rm_body.rm_padded.rm_align =
be32_to_cpu(rmsgp->rm_body.rm_padded.rm_align);
rmsgp->rm_body.rm_padded.rm_thresh =
@@ -175,11 +190,15 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
va = &rmsgp->rm_body.rm_padded.rm_pempty[4];
rqstp->rq_arg.head[0].iov_base = va;
- hdrlen = (u32)((unsigned long)va - (unsigned long)rmsgp);
- rqstp->rq_arg.head[0].iov_len -= hdrlen;
- if (hdrlen > rqstp->rq_arg.len)
+ len = (u32)((unsigned long)va - (unsigned long)rmsgp);
+ rqstp->rq_arg.head[0].iov_len -= len;
+ if (len > rqstp->rq_arg.len)
return -EINVAL;
- return hdrlen;
+ return len;
+ default:
+ dprintk("svcrdma: bad rdma procedure (%u)\n",
+ be32_to_cpu(rmsgp->rm_type));
+ return -EINVAL;
}
/* The chunk list may contain either a read chunk list or a write
@@ -188,20 +207,25 @@ int svc_rdma_xdr_decode_req(struct rpcrdma_msg **rdma_req,
va = &rmsgp->rm_body.rm_chunks[0];
vaend = (__be32 *)((unsigned long)rmsgp + rqstp->rq_arg.len);
va = decode_read_list(va, vaend);
- if (!va)
+ if (!va) {
+ dprintk("svcrdma: failed to decode read list\n");
return -EINVAL;
+ }
va = decode_write_list(va, vaend);
- if (!va)
+ if (!va) {
+ dprintk("svcrdma: failed to decode write list\n");
return -EINVAL;
+ }
va = decode_reply_array(va, vaend);
- if (!va)
+ if (!va) {
+ dprintk("svcrdma: failed to decode reply chunk\n");
return -EINVAL;
+ }
rqstp->rq_arg.head[0].iov_base = va;
hdr_len = (unsigned long)va - (unsigned long)rmsgp;
rqstp->rq_arg.head[0].iov_len -= hdr_len;
- *rdma_req = rmsgp;
return hdr_len;
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
index f0c3ff67ca98..3b24a646eb46 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_recvfrom.c
@@ -126,7 +126,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
u64 rs_offset,
bool last)
{
- struct ib_send_wr read_wr;
+ struct ib_rdma_wr read_wr;
int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
int ret, read, pno;
@@ -144,6 +144,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
head->arg.page_len += len;
+
head->arg.len += len;
if (!pg_off)
head->count++;
@@ -160,8 +161,7 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
goto err;
atomic_inc(&xprt->sc_dma_used);
- /* The lkey here is either a local dma lkey or a dma_mr lkey */
- ctxt->sge[pno].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[pno].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->sge[pno].length = len;
ctxt->count++;
@@ -180,16 +180,16 @@ int rdma_read_chunk_lcl(struct svcxprt_rdma *xprt,
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
memset(&read_wr, 0, sizeof(read_wr));
- read_wr.wr_id = (unsigned long)ctxt;
- read_wr.opcode = IB_WR_RDMA_READ;
- ctxt->wr_op = read_wr.opcode;
- read_wr.send_flags = IB_SEND_SIGNALED;
- read_wr.wr.rdma.rkey = rs_handle;
- read_wr.wr.rdma.remote_addr = rs_offset;
- read_wr.sg_list = ctxt->sge;
- read_wr.num_sge = pages_needed;
-
- ret = svc_rdma_send(xprt, &read_wr);
+ ctxt->cqe.done = svc_rdma_wc_read;
+ read_wr.wr.wr_cqe = &ctxt->cqe;
+ read_wr.wr.opcode = IB_WR_RDMA_READ;
+ read_wr.wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.rkey = rs_handle;
+ read_wr.remote_addr = rs_offset;
+ read_wr.wr.sg_list = ctxt->sge;
+ read_wr.wr.num_sge = pages_needed;
+
+ ret = svc_rdma_send(xprt, &read_wr.wr);
if (ret) {
pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -219,14 +219,14 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
u64 rs_offset,
bool last)
{
- struct ib_send_wr read_wr;
+ struct ib_rdma_wr read_wr;
struct ib_send_wr inv_wr;
- struct ib_send_wr fastreg_wr;
+ struct ib_reg_wr reg_wr;
u8 key;
- int pages_needed = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
+ int nents = PAGE_ALIGN(*page_offset + rs_length) >> PAGE_SHIFT;
struct svc_rdma_op_ctxt *ctxt = svc_rdma_get_context(xprt);
struct svc_rdma_fastreg_mr *frmr = svc_rdma_get_frmr(xprt);
- int ret, read, pno;
+ int ret, read, pno, dma_nents, n;
u32 pg_off = *page_offset;
u32 pg_no = *page_no;
@@ -235,17 +235,14 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
ctxt->direction = DMA_FROM_DEVICE;
ctxt->frmr = frmr;
- pages_needed = min_t(int, pages_needed, xprt->sc_frmr_pg_list_len);
- read = min_t(int, (pages_needed << PAGE_SHIFT) - *page_offset,
- rs_length);
+ nents = min_t(unsigned int, nents, xprt->sc_frmr_pg_list_len);
+ read = min_t(int, (nents << PAGE_SHIFT) - *page_offset, rs_length);
- frmr->kva = page_address(rqstp->rq_arg.pages[pg_no]);
frmr->direction = DMA_FROM_DEVICE;
frmr->access_flags = (IB_ACCESS_LOCAL_WRITE|IB_ACCESS_REMOTE_WRITE);
- frmr->map_len = pages_needed << PAGE_SHIFT;
- frmr->page_list_len = pages_needed;
+ frmr->sg_nents = nents;
- for (pno = 0; pno < pages_needed; pno++) {
+ for (pno = 0; pno < nents; pno++) {
int len = min_t(int, rs_length, PAGE_SIZE - pg_off);
head->arg.pages[pg_no] = rqstp->rq_arg.pages[pg_no];
@@ -253,17 +250,12 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
head->arg.len += len;
if (!pg_off)
head->count++;
+
+ sg_set_page(&frmr->sg[pno], rqstp->rq_arg.pages[pg_no],
+ len, pg_off);
+
rqstp->rq_respages = &rqstp->rq_arg.pages[pg_no+1];
rqstp->rq_next_page = rqstp->rq_respages + 1;
- frmr->page_list->page_list[pno] =
- ib_dma_map_page(xprt->sc_cm_id->device,
- head->arg.pages[pg_no], 0,
- PAGE_SIZE, DMA_FROM_DEVICE);
- ret = ib_dma_mapping_error(xprt->sc_cm_id->device,
- frmr->page_list->page_list[pno]);
- if (ret)
- goto err;
- atomic_inc(&xprt->sc_dma_used);
/* adjust offset and wrap to next page if needed */
pg_off += len;
@@ -279,54 +271,70 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
else
clear_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags);
+ dma_nents = ib_dma_map_sg(xprt->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents,
+ frmr->direction);
+ if (!dma_nents) {
+ pr_err("svcrdma: failed to dma map sg %p\n",
+ frmr->sg);
+ return -ENOMEM;
+ }
+ atomic_inc(&xprt->sc_dma_used);
+
+ n = ib_map_mr_sg(frmr->mr, frmr->sg, frmr->sg_nents, PAGE_SIZE);
+ if (unlikely(n != frmr->sg_nents)) {
+ pr_err("svcrdma: failed to map mr %p (%d/%d elements)\n",
+ frmr->mr, n, frmr->sg_nents);
+ return n < 0 ? n : -EINVAL;
+ }
+
/* Bump the key */
key = (u8)(frmr->mr->lkey & 0x000000FF);
ib_update_fast_reg_key(frmr->mr, ++key);
- ctxt->sge[0].addr = (unsigned long)frmr->kva + *page_offset;
+ ctxt->sge[0].addr = frmr->mr->iova;
ctxt->sge[0].lkey = frmr->mr->lkey;
- ctxt->sge[0].length = read;
+ ctxt->sge[0].length = frmr->mr->length;
ctxt->count = 1;
ctxt->read_hdr = head;
- /* Prepare FASTREG WR */
- memset(&fastreg_wr, 0, sizeof(fastreg_wr));
- fastreg_wr.opcode = IB_WR_FAST_REG_MR;
- fastreg_wr.send_flags = IB_SEND_SIGNALED;
- fastreg_wr.wr.fast_reg.iova_start = (unsigned long)frmr->kva;
- fastreg_wr.wr.fast_reg.page_list = frmr->page_list;
- fastreg_wr.wr.fast_reg.page_list_len = frmr->page_list_len;
- fastreg_wr.wr.fast_reg.page_shift = PAGE_SHIFT;
- fastreg_wr.wr.fast_reg.length = frmr->map_len;
- fastreg_wr.wr.fast_reg.access_flags = frmr->access_flags;
- fastreg_wr.wr.fast_reg.rkey = frmr->mr->lkey;
- fastreg_wr.next = &read_wr;
+ /* Prepare REG WR */
+ ctxt->reg_cqe.done = svc_rdma_wc_reg;
+ reg_wr.wr.wr_cqe = &ctxt->reg_cqe;
+ reg_wr.wr.opcode = IB_WR_REG_MR;
+ reg_wr.wr.send_flags = IB_SEND_SIGNALED;
+ reg_wr.wr.num_sge = 0;
+ reg_wr.mr = frmr->mr;
+ reg_wr.key = frmr->mr->lkey;
+ reg_wr.access = frmr->access_flags;
+ reg_wr.wr.next = &read_wr.wr;
/* Prepare RDMA_READ */
memset(&read_wr, 0, sizeof(read_wr));
- read_wr.send_flags = IB_SEND_SIGNALED;
- read_wr.wr.rdma.rkey = rs_handle;
- read_wr.wr.rdma.remote_addr = rs_offset;
- read_wr.sg_list = ctxt->sge;
- read_wr.num_sge = 1;
+ ctxt->cqe.done = svc_rdma_wc_read;
+ read_wr.wr.wr_cqe = &ctxt->cqe;
+ read_wr.wr.send_flags = IB_SEND_SIGNALED;
+ read_wr.rkey = rs_handle;
+ read_wr.remote_addr = rs_offset;
+ read_wr.wr.sg_list = ctxt->sge;
+ read_wr.wr.num_sge = 1;
if (xprt->sc_dev_caps & SVCRDMA_DEVCAP_READ_W_INV) {
- read_wr.opcode = IB_WR_RDMA_READ_WITH_INV;
- read_wr.wr_id = (unsigned long)ctxt;
- read_wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
+ read_wr.wr.opcode = IB_WR_RDMA_READ_WITH_INV;
+ read_wr.wr.ex.invalidate_rkey = ctxt->frmr->mr->lkey;
} else {
- read_wr.opcode = IB_WR_RDMA_READ;
- read_wr.next = &inv_wr;
+ read_wr.wr.opcode = IB_WR_RDMA_READ;
+ read_wr.wr.next = &inv_wr;
/* Prepare invalidate */
memset(&inv_wr, 0, sizeof(inv_wr));
- inv_wr.wr_id = (unsigned long)ctxt;
+ ctxt->inv_cqe.done = svc_rdma_wc_inv;
+ inv_wr.wr_cqe = &ctxt->inv_cqe;
inv_wr.opcode = IB_WR_LOCAL_INV;
inv_wr.send_flags = IB_SEND_SIGNALED | IB_SEND_FENCE;
inv_wr.ex.invalidate_rkey = frmr->mr->lkey;
}
- ctxt->wr_op = read_wr.opcode;
/* Post the chain */
- ret = svc_rdma_send(xprt, &fastreg_wr);
+ ret = svc_rdma_send(xprt, &reg_wr.wr);
if (ret) {
pr_err("svcrdma: Error %d posting RDMA_READ\n", ret);
set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
@@ -340,7 +348,8 @@ int rdma_read_chunk_frmr(struct svcxprt_rdma *xprt,
atomic_inc(&rdma_stat_read);
return ret;
err:
- svc_rdma_unmap_dma(ctxt);
+ ib_dma_unmap_sg(xprt->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents, frmr->direction);
svc_rdma_put_context(ctxt, 0);
svc_rdma_put_frmr(xprt, frmr);
return ret;
@@ -560,6 +569,38 @@ static int rdma_read_complete(struct svc_rqst *rqstp,
return ret;
}
+/* By convention, backchannel calls arrive via rdma_msg type
+ * messages, and never populate the chunk lists. This makes
+ * the RPC/RDMA header small and fixed in size, so it is
+ * straightforward to check the RPC header's direction field.
+ */
+static bool
+svc_rdma_is_backchannel_reply(struct svc_xprt *xprt, struct rpcrdma_msg *rmsgp)
+{
+ __be32 *p = (__be32 *)rmsgp;
+
+ if (!xprt->xpt_bc_xprt)
+ return false;
+
+ if (rmsgp->rm_type != rdma_msg)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[0] != xdr_zero)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[1] != xdr_zero)
+ return false;
+ if (rmsgp->rm_body.rm_chunks[2] != xdr_zero)
+ return false;
+
+ /* sanity */
+ if (p[7] != rmsgp->rm_xid)
+ return false;
+ /* call direction */
+ if (p[8] == cpu_to_be32(RPC_CALL))
+ return false;
+
+ return true;
+}
+
/*
* Set up the rqstp thread context to point to the RQ buffer. If
* necessary, pull additional data from the client with an RDMA_READ
@@ -573,7 +614,6 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
struct svc_rdma_op_ctxt *ctxt = NULL;
struct rpcrdma_msg *rmsgp;
int ret = 0;
- int len;
dprintk("svcrdma: rqstp=%p\n", rqstp);
@@ -603,8 +643,7 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
* transport list
*/
if (test_bit(XPT_CLOSE, &xprt->xpt_flags))
- goto close_out;
-
+ goto defer;
goto out;
}
dprintk("svcrdma: processing ctxt=%p on xprt=%p, rqstp=%p, status=%d\n",
@@ -615,14 +654,21 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
rdma_build_arg_xdr(rqstp, ctxt, ctxt->byte_len);
/* Decode the RDMA header. */
- len = svc_rdma_xdr_decode_req(&rmsgp, rqstp);
- rqstp->rq_xprt_hlen = len;
-
- /* If the request is invalid, reply with an error */
- if (len < 0) {
- if (len == -ENOSYS)
- svc_rdma_send_error(rdma_xprt, rmsgp, ERR_VERS);
- goto close_out;
+ rmsgp = (struct rpcrdma_msg *)rqstp->rq_arg.head[0].iov_base;
+ ret = svc_rdma_xdr_decode_req(rmsgp, rqstp);
+ if (ret < 0)
+ goto out_err;
+ if (ret == 0)
+ goto out_drop;
+ rqstp->rq_xprt_hlen = ret;
+
+ if (svc_rdma_is_backchannel_reply(xprt, rmsgp)) {
+ ret = svc_rdma_handle_bc_reply(xprt->xpt_bc_xprt, rmsgp,
+ &rqstp->rq_arg);
+ svc_rdma_put_context(ctxt, 0);
+ if (ret)
+ goto repost;
+ return ret;
}
/* Read read-list data. */
@@ -650,15 +696,16 @@ int svc_rdma_recvfrom(struct svc_rqst *rqstp)
svc_xprt_copy_addrs(rqstp, xprt);
return ret;
- close_out:
- if (ctxt)
- svc_rdma_put_context(ctxt, 1);
- dprintk("svcrdma: transport %p is closing\n", xprt);
- /*
- * Set the close bit and enqueue it. svc_recv will see the
- * close bit and call svc_xprt_delete
- */
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
+out_err:
+ svc_rdma_send_error(rdma_xprt, rmsgp, ret);
+ svc_rdma_put_context(ctxt, 0);
+ return 0;
+
defer:
return 0;
+
+out_drop:
+ svc_rdma_put_context(ctxt, 1);
+repost:
+ return svc_rdma_repost_recv(rdma_xprt, GFP_KERNEL);
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_sendto.c b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
index 1dfae8317065..4f1b1c4f45f9 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_sendto.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_sendto.c
@@ -50,9 +50,15 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
-static int map_xdr(struct svcxprt_rdma *xprt,
- struct xdr_buf *xdr,
- struct svc_rdma_req_map *vec)
+static u32 xdr_padsize(u32 len)
+{
+ return (len & 3) ? (4 - (len & 3)) : 0;
+}
+
+int svc_rdma_map_xdr(struct svcxprt_rdma *xprt,
+ struct xdr_buf *xdr,
+ struct svc_rdma_req_map *vec,
+ bool write_chunk_present)
{
int sge_no;
u32 sge_bytes;
@@ -62,7 +68,7 @@ static int map_xdr(struct svcxprt_rdma *xprt,
if (xdr->len !=
(xdr->head[0].iov_len + xdr->page_len + xdr->tail[0].iov_len)) {
- pr_err("svcrdma: map_xdr: XDR buffer length error\n");
+ pr_err("svcrdma: %s: XDR buffer length error\n", __func__);
return -EIO;
}
@@ -92,14 +98,25 @@ static int map_xdr(struct svcxprt_rdma *xprt,
/* Tail SGE */
if (xdr->tail[0].iov_len) {
- vec->sge[sge_no].iov_base = xdr->tail[0].iov_base;
- vec->sge[sge_no].iov_len = xdr->tail[0].iov_len;
- sge_no++;
+ unsigned char *base = xdr->tail[0].iov_base;
+ size_t len = xdr->tail[0].iov_len;
+ u32 xdr_pad = xdr_padsize(xdr->page_len);
+
+ if (write_chunk_present && xdr_pad) {
+ base += xdr_pad;
+ len -= xdr_pad;
+ }
+
+ if (len) {
+ vec->sge[sge_no].iov_base = base;
+ vec->sge[sge_no].iov_len = len;
+ sge_no++;
+ }
}
- dprintk("svcrdma: map_xdr: sge_no %d page_no %d "
+ dprintk("svcrdma: %s: sge_no %d page_no %d "
"page_base %u page_len %u head_len %zu tail_len %zu\n",
- sge_no, page_no, xdr->page_base, xdr->page_len,
+ __func__, sge_no, page_no, xdr->page_base, xdr->page_len,
xdr->head[0].iov_len, xdr->tail[0].iov_len);
vec->count = sge_no;
@@ -166,10 +183,10 @@ svc_rdma_get_write_array(struct rpcrdma_msg *rmsgp)
* reply array is present
*/
static struct rpcrdma_write_array *
-svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
+svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp,
+ struct rpcrdma_write_array *wr_ary)
{
struct rpcrdma_read_chunk *rch;
- struct rpcrdma_write_array *wr_ary;
struct rpcrdma_write_array *rp_ary;
/* XXX: Need to fix when reply chunk may occur with read list
@@ -191,7 +208,6 @@ svc_rdma_get_reply_array(struct rpcrdma_msg *rmsgp)
goto found_it;
}
- wr_ary = svc_rdma_get_write_array(rmsgp);
if (wr_ary) {
int chunk = be32_to_cpu(wr_ary->wc_nchunks);
@@ -217,7 +233,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
u32 xdr_off, int write_len,
struct svc_rdma_req_map *vec)
{
- struct ib_send_wr write_wr;
+ struct ib_rdma_wr write_wr;
struct ib_sge *sge;
int xdr_sge_no;
int sge_no;
@@ -265,7 +281,7 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
sge[sge_no].addr))
goto err;
atomic_inc(&xprt->sc_dma_used);
- sge[sge_no].lkey = xprt->sc_dma_lkey;
+ sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->count++;
sge_off = 0;
sge_no++;
@@ -281,58 +297,54 @@ static int send_write(struct svcxprt_rdma *xprt, struct svc_rqst *rqstp,
/* Prepare WRITE WR */
memset(&write_wr, 0, sizeof write_wr);
- ctxt->wr_op = IB_WR_RDMA_WRITE;
- write_wr.wr_id = (unsigned long)ctxt;
- write_wr.sg_list = &sge[0];
- write_wr.num_sge = sge_no;
- write_wr.opcode = IB_WR_RDMA_WRITE;
- write_wr.send_flags = IB_SEND_SIGNALED;
- write_wr.wr.rdma.rkey = rmr;
- write_wr.wr.rdma.remote_addr = to;
+ ctxt->cqe.done = svc_rdma_wc_write;
+ write_wr.wr.wr_cqe = &ctxt->cqe;
+ write_wr.wr.sg_list = &sge[0];
+ write_wr.wr.num_sge = sge_no;
+ write_wr.wr.opcode = IB_WR_RDMA_WRITE;
+ write_wr.wr.send_flags = IB_SEND_SIGNALED;
+ write_wr.rkey = rmr;
+ write_wr.remote_addr = to;
/* Post It */
atomic_inc(&rdma_stat_write);
- if (svc_rdma_send(xprt, &write_wr))
+ if (svc_rdma_send(xprt, &write_wr.wr))
goto err;
return write_len - bc;
err:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 0);
- /* Fatal error, close transport */
return -EIO;
}
+noinline
static int send_write_chunks(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_write_array *wr_ary,
struct rpcrdma_msg *rdma_resp,
struct svc_rqst *rqstp,
struct svc_rdma_req_map *vec)
{
- u32 xfer_len = rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+ u32 xfer_len = rqstp->rq_res.page_len;
int write_len;
u32 xdr_off;
int chunk_off;
int chunk_no;
int nchunks;
- struct rpcrdma_write_array *arg_ary;
struct rpcrdma_write_array *res_ary;
int ret;
- arg_ary = svc_rdma_get_write_array(rdma_argp);
- if (!arg_ary)
- return 0;
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[1];
/* Write chunks start at the pagelist */
- nchunks = be32_to_cpu(arg_ary->wc_nchunks);
+ nchunks = be32_to_cpu(wr_ary->wc_nchunks);
for (xdr_off = rqstp->rq_res.head[0].iov_len, chunk_no = 0;
xfer_len && chunk_no < nchunks;
chunk_no++) {
struct rpcrdma_segment *arg_ch;
u64 rs_offset;
- arg_ch = &arg_ary->wc_array[chunk_no].wc_target;
+ arg_ch = &wr_ary->wc_array[chunk_no].wc_target;
write_len = min(xfer_len, be32_to_cpu(arg_ch->rs_length));
/* Prepare the response chunk given the length actually
@@ -350,11 +362,8 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
xdr_off,
write_len,
vec);
- if (ret <= 0) {
- dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
- ret);
- return -EIO;
- }
+ if (ret <= 0)
+ goto out_err;
chunk_off += ret;
xdr_off += ret;
xfer_len -= ret;
@@ -364,11 +373,16 @@ static int send_write_chunks(struct svcxprt_rdma *xprt,
/* Update the req with the number of chunks actually used */
svc_rdma_xdr_encode_write_list(rdma_resp, chunk_no);
- return rqstp->rq_res.page_len + rqstp->rq_res.tail[0].iov_len;
+ return rqstp->rq_res.page_len;
+
+out_err:
+ pr_err("svcrdma: failed to send write chunks, rc=%d\n", ret);
+ return -EIO;
}
+noinline
static int send_reply_chunks(struct svcxprt_rdma *xprt,
- struct rpcrdma_msg *rdma_argp,
+ struct rpcrdma_write_array *rp_ary,
struct rpcrdma_msg *rdma_resp,
struct svc_rqst *rqstp,
struct svc_rdma_req_map *vec)
@@ -380,25 +394,21 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
int chunk_off;
int nchunks;
struct rpcrdma_segment *ch;
- struct rpcrdma_write_array *arg_ary;
struct rpcrdma_write_array *res_ary;
int ret;
- arg_ary = svc_rdma_get_reply_array(rdma_argp);
- if (!arg_ary)
- return 0;
/* XXX: need to fix when reply lists occur with read-list and or
* write-list */
res_ary = (struct rpcrdma_write_array *)
&rdma_resp->rm_body.rm_chunks[2];
/* xdr offset starts at RPC message */
- nchunks = be32_to_cpu(arg_ary->wc_nchunks);
+ nchunks = be32_to_cpu(rp_ary->wc_nchunks);
for (xdr_off = 0, chunk_no = 0;
xfer_len && chunk_no < nchunks;
chunk_no++) {
u64 rs_offset;
- ch = &arg_ary->wc_array[chunk_no].wc_target;
+ ch = &rp_ary->wc_array[chunk_no].wc_target;
write_len = min(xfer_len, be32_to_cpu(ch->rs_length));
/* Prepare the reply chunk given the length actually
@@ -415,11 +425,8 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
xdr_off,
write_len,
vec);
- if (ret <= 0) {
- dprintk("svcrdma: RDMA_WRITE failed, ret=%d\n",
- ret);
- return -EIO;
- }
+ if (ret <= 0)
+ goto out_err;
chunk_off += ret;
xdr_off += ret;
xfer_len -= ret;
@@ -430,6 +437,10 @@ static int send_reply_chunks(struct svcxprt_rdma *xprt,
svc_rdma_xdr_encode_reply_array(res_ary, chunk_no);
return rqstp->rq_res.len;
+
+out_err:
+ pr_err("svcrdma: failed to send reply chunks, rc=%d\n", ret);
+ return -EIO;
}
/* This function prepares the portion of the RPCRDMA message to be
@@ -464,13 +475,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
int pages;
int ret;
- /* Post a recv buffer to handle another request. */
- ret = svc_rdma_post_recv(rdma);
+ ret = svc_rdma_repost_recv(rdma, GFP_KERNEL);
if (ret) {
- printk(KERN_INFO
- "svcrdma: could not post a receive buffer, err=%d."
- "Closing transport %p.\n", ret, rdma);
- set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
svc_rdma_put_context(ctxt, 0);
return -ENOTCONN;
}
@@ -480,7 +486,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->count = 1;
/* Prepare the SGE for the RPCRDMA Header */
- ctxt->sge[0].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[0].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[0].length = svc_rdma_xdr_get_reply_hdr_len(rdma_resp);
ctxt->sge[0].addr =
ib_dma_map_page(rdma->sc_cm_id->device, page, 0,
@@ -504,7 +510,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
ctxt->sge[sge_no].addr))
goto err;
atomic_inc(&rdma->sc_dma_used);
- ctxt->sge[sge_no].lkey = rdma->sc_dma_lkey;
+ ctxt->sge[sge_no].lkey = rdma->sc_pd->local_dma_lkey;
ctxt->sge[sge_no].length = sge_bytes;
}
if (byte_count != 0) {
@@ -543,8 +549,8 @@ static int send_reply(struct svcxprt_rdma *rdma,
goto err;
}
memset(&send_wr, 0, sizeof send_wr);
- ctxt->wr_op = IB_WR_SEND;
- send_wr.wr_id = (unsigned long)ctxt;
+ ctxt->cqe.done = svc_rdma_wc_send;
+ send_wr.wr_cqe = &ctxt->cqe;
send_wr.sg_list = ctxt->sge;
send_wr.num_sge = sge_no;
send_wr.opcode = IB_WR_SEND;
@@ -559,6 +565,7 @@ static int send_reply(struct svcxprt_rdma *rdma,
err:
svc_rdma_unmap_dma(ctxt);
svc_rdma_put_context(ctxt, 1);
+ pr_err("svcrdma: failed to send reply, rc=%d\n", ret);
return -EIO;
}
@@ -573,7 +580,7 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
container_of(xprt, struct svcxprt_rdma, sc_xprt);
struct rpcrdma_msg *rdma_argp;
struct rpcrdma_msg *rdma_resp;
- struct rpcrdma_write_array *reply_ary;
+ struct rpcrdma_write_array *wr_ary, *rp_ary;
enum rpcrdma_proc reply_type;
int ret;
int inline_bytes;
@@ -587,21 +594,25 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
* places this at the start of page 0.
*/
rdma_argp = page_address(rqstp->rq_pages[0]);
+ wr_ary = svc_rdma_get_write_array(rdma_argp);
+ rp_ary = svc_rdma_get_reply_array(rdma_argp, wr_ary);
/* Build an req vec for the XDR */
ctxt = svc_rdma_get_context(rdma);
ctxt->direction = DMA_TO_DEVICE;
- vec = svc_rdma_get_req_map();
- ret = map_xdr(rdma, &rqstp->rq_res, vec);
+ vec = svc_rdma_get_req_map(rdma);
+ ret = svc_rdma_map_xdr(rdma, &rqstp->rq_res, vec, wr_ary != NULL);
if (ret)
goto err0;
inline_bytes = rqstp->rq_res.len;
/* Create the RDMA response header */
- res_page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+ ret = -ENOMEM;
+ res_page = alloc_page(GFP_KERNEL);
+ if (!res_page)
+ goto err0;
rdma_resp = page_address(res_page);
- reply_ary = svc_rdma_get_reply_array(rdma_argp);
- if (reply_ary)
+ if (rp_ary)
reply_type = RDMA_NOMSG;
else
reply_type = RDMA_MSG;
@@ -609,35 +620,97 @@ int svc_rdma_sendto(struct svc_rqst *rqstp)
rdma_resp, reply_type);
/* Send any write-chunk data and build resp write-list */
- ret = send_write_chunks(rdma, rdma_argp, rdma_resp,
- rqstp, vec);
- if (ret < 0) {
- printk(KERN_ERR "svcrdma: failed to send write chunks, rc=%d\n",
- ret);
- goto err1;
+ if (wr_ary) {
+ ret = send_write_chunks(rdma, wr_ary, rdma_resp, rqstp, vec);
+ if (ret < 0)
+ goto err1;
+ inline_bytes -= ret + xdr_padsize(ret);
}
- inline_bytes -= ret;
/* Send any reply-list data and update resp reply-list */
- ret = send_reply_chunks(rdma, rdma_argp, rdma_resp,
- rqstp, vec);
- if (ret < 0) {
- printk(KERN_ERR "svcrdma: failed to send reply chunks, rc=%d\n",
- ret);
- goto err1;
+ if (rp_ary) {
+ ret = send_reply_chunks(rdma, rp_ary, rdma_resp, rqstp, vec);
+ if (ret < 0)
+ goto err1;
+ inline_bytes -= ret;
}
- inline_bytes -= ret;
ret = send_reply(rdma, rqstp, res_page, rdma_resp, ctxt, vec,
inline_bytes);
- svc_rdma_put_req_map(vec);
+ if (ret < 0)
+ goto err1;
+
+ svc_rdma_put_req_map(rdma, vec);
dprintk("svcrdma: send_reply returns %d\n", ret);
return ret;
err1:
put_page(res_page);
err0:
- svc_rdma_put_req_map(vec);
+ svc_rdma_put_req_map(rdma, vec);
svc_rdma_put_context(ctxt, 0);
- return ret;
+ set_bit(XPT_CLOSE, &rdma->sc_xprt.xpt_flags);
+ return -ENOTCONN;
+}
+
+void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
+ int status)
+{
+ struct ib_send_wr err_wr;
+ struct page *p;
+ struct svc_rdma_op_ctxt *ctxt;
+ enum rpcrdma_errcode err;
+ __be32 *va;
+ int length;
+ int ret;
+
+ ret = svc_rdma_repost_recv(xprt, GFP_KERNEL);
+ if (ret)
+ return;
+
+ p = alloc_page(GFP_KERNEL);
+ if (!p)
+ return;
+ va = page_address(p);
+
+ /* XDR encode an error reply */
+ err = ERR_CHUNK;
+ if (status == -EPROTONOSUPPORT)
+ err = ERR_VERS;
+ length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
+
+ ctxt = svc_rdma_get_context(xprt);
+ ctxt->direction = DMA_TO_DEVICE;
+ ctxt->count = 1;
+ ctxt->pages[0] = p;
+
+ /* Prepare SGE for local address */
+ ctxt->sge[0].lkey = xprt->sc_pd->local_dma_lkey;
+ ctxt->sge[0].length = length;
+ ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
+ p, 0, length, DMA_TO_DEVICE);
+ if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
+ dprintk("svcrdma: Error mapping buffer for protocol error\n");
+ svc_rdma_put_context(ctxt, 1);
+ return;
+ }
+ atomic_inc(&xprt->sc_dma_used);
+
+ /* Prepare SEND WR */
+ memset(&err_wr, 0, sizeof(err_wr));
+ ctxt->cqe.done = svc_rdma_wc_send;
+ err_wr.wr_cqe = &ctxt->cqe;
+ err_wr.sg_list = ctxt->sge;
+ err_wr.num_sge = 1;
+ err_wr.opcode = IB_WR_SEND;
+ err_wr.send_flags = IB_SEND_SIGNALED;
+
+ /* Post It */
+ ret = svc_rdma_send(xprt, &err_wr);
+ if (ret) {
+ dprintk("svcrdma: Error %d posting send for protocol error\n",
+ ret);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
+ }
}
diff --git a/net/sunrpc/xprtrdma/svc_rdma_transport.c b/net/sunrpc/xprtrdma/svc_rdma_transport.c
index fcc3eb80c265..90668969d559 100644
--- a/net/sunrpc/xprtrdma/svc_rdma_transport.c
+++ b/net/sunrpc/xprtrdma/svc_rdma_transport.c
@@ -56,23 +56,17 @@
#define RPCDBG_FACILITY RPCDBG_SVCXPRT
+static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *, int);
static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
struct net *net,
struct sockaddr *sa, int salen,
int flags);
static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt);
static void svc_rdma_release_rqst(struct svc_rqst *);
-static void dto_tasklet_func(unsigned long data);
static void svc_rdma_detach(struct svc_xprt *xprt);
static void svc_rdma_free(struct svc_xprt *xprt);
static int svc_rdma_has_wspace(struct svc_xprt *xprt);
static int svc_rdma_secure_port(struct svc_rqst *);
-static void rq_cq_reap(struct svcxprt_rdma *xprt);
-static void sq_cq_reap(struct svcxprt_rdma *xprt);
-
-static DECLARE_TASKLET(dto_tasklet, dto_tasklet_func, 0UL);
-static DEFINE_SPINLOCK(dto_lock);
-static LIST_HEAD(dto_xprt_q);
static struct svc_xprt_ops svc_rdma_ops = {
.xpo_create = svc_rdma_create,
@@ -95,18 +89,133 @@ struct svc_xprt_class svc_rdma_class = {
.xcl_ident = XPRT_TRANSPORT_RDMA,
};
-struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *, struct net *,
+ struct sockaddr *, int, int);
+static void svc_rdma_bc_detach(struct svc_xprt *);
+static void svc_rdma_bc_free(struct svc_xprt *);
+
+static struct svc_xprt_ops svc_rdma_bc_ops = {
+ .xpo_create = svc_rdma_bc_create,
+ .xpo_detach = svc_rdma_bc_detach,
+ .xpo_free = svc_rdma_bc_free,
+ .xpo_prep_reply_hdr = svc_rdma_prep_reply_hdr,
+ .xpo_secure_port = svc_rdma_secure_port,
+};
+
+struct svc_xprt_class svc_rdma_bc_class = {
+ .xcl_name = "rdma-bc",
+ .xcl_owner = THIS_MODULE,
+ .xcl_ops = &svc_rdma_bc_ops,
+ .xcl_max_payload = (1024 - RPCRDMA_HDRLEN_MIN)
+};
+
+static struct svc_xprt *svc_rdma_bc_create(struct svc_serv *serv,
+ struct net *net,
+ struct sockaddr *sa, int salen,
+ int flags)
+{
+ struct svcxprt_rdma *cma_xprt;
+ struct svc_xprt *xprt;
+
+ cma_xprt = rdma_create_xprt(serv, 0);
+ if (!cma_xprt)
+ return ERR_PTR(-ENOMEM);
+ xprt = &cma_xprt->sc_xprt;
+
+ svc_xprt_init(net, &svc_rdma_bc_class, xprt, serv);
+ serv->sv_bc_xprt = xprt;
+
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+ return xprt;
+}
+
+static void svc_rdma_bc_detach(struct svc_xprt *xprt)
+{
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+}
+
+static void svc_rdma_bc_free(struct svc_xprt *xprt)
+{
+ struct svcxprt_rdma *rdma =
+ container_of(xprt, struct svcxprt_rdma, sc_xprt);
+
+ dprintk("svcrdma: %s(%p)\n", __func__, xprt);
+ if (xprt)
+ kfree(rdma);
+}
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+static struct svc_rdma_op_ctxt *alloc_ctxt(struct svcxprt_rdma *xprt,
+ gfp_t flags)
{
struct svc_rdma_op_ctxt *ctxt;
- ctxt = kmem_cache_alloc(svc_rdma_ctxt_cachep,
- GFP_KERNEL | __GFP_NOFAIL);
- ctxt->xprt = xprt;
- INIT_LIST_HEAD(&ctxt->dto_q);
+ ctxt = kmalloc(sizeof(*ctxt), flags);
+ if (ctxt) {
+ ctxt->xprt = xprt;
+ INIT_LIST_HEAD(&ctxt->free);
+ INIT_LIST_HEAD(&ctxt->dto_q);
+ }
+ return ctxt;
+}
+
+static bool svc_rdma_prealloc_ctxts(struct svcxprt_rdma *xprt)
+{
+ unsigned int i;
+
+ /* Each RPC/RDMA credit can consume a number of send
+ * and receive WQEs. One ctxt is allocated for each.
+ */
+ i = xprt->sc_sq_depth + xprt->sc_rq_depth;
+
+ while (i--) {
+ struct svc_rdma_op_ctxt *ctxt;
+
+ ctxt = alloc_ctxt(xprt, GFP_KERNEL);
+ if (!ctxt) {
+ dprintk("svcrdma: No memory for RDMA ctxt\n");
+ return false;
+ }
+ list_add(&ctxt->free, &xprt->sc_ctxts);
+ }
+ return true;
+}
+
+struct svc_rdma_op_ctxt *svc_rdma_get_context(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_op_ctxt *ctxt = NULL;
+
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used++;
+ if (list_empty(&xprt->sc_ctxts))
+ goto out_empty;
+
+ ctxt = list_first_entry(&xprt->sc_ctxts,
+ struct svc_rdma_op_ctxt, free);
+ list_del_init(&ctxt->free);
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+out:
ctxt->count = 0;
ctxt->frmr = NULL;
- atomic_inc(&xprt->sc_ctxt_used);
return ctxt;
+
+out_empty:
+ /* Either pre-allocation missed the mark, or send
+ * queue accounting is broken.
+ */
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+
+ ctxt = alloc_ctxt(xprt, GFP_NOIO);
+ if (ctxt)
+ goto out;
+
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used--;
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
+ WARN_ONCE(1, "svcrdma: empty RDMA ctxt list?\n");
+ return NULL;
}
void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
@@ -116,11 +225,11 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
for (i = 0; i < ctxt->count && ctxt->sge[i].length; i++) {
/*
* Unmap the DMA addr in the SGE if the lkey matches
- * the sc_dma_lkey, otherwise, ignore it since it is
+ * the local_dma_lkey, otherwise, ignore it since it is
* an FRMR lkey and will be unmapped later when the
* last WR that uses it completes.
*/
- if (ctxt->sge[i].lkey == xprt->sc_dma_lkey) {
+ if (ctxt->sge[i].lkey == xprt->sc_pd->local_dma_lkey) {
atomic_dec(&xprt->sc_dma_used);
ib_dma_unmap_page(xprt->sc_cm_id->device,
ctxt->sge[i].addr,
@@ -132,44 +241,108 @@ void svc_rdma_unmap_dma(struct svc_rdma_op_ctxt *ctxt)
void svc_rdma_put_context(struct svc_rdma_op_ctxt *ctxt, int free_pages)
{
- struct svcxprt_rdma *xprt;
+ struct svcxprt_rdma *xprt = ctxt->xprt;
int i;
- xprt = ctxt->xprt;
if (free_pages)
for (i = 0; i < ctxt->count; i++)
put_page(ctxt->pages[i]);
- kmem_cache_free(svc_rdma_ctxt_cachep, ctxt);
- atomic_dec(&xprt->sc_ctxt_used);
+ spin_lock_bh(&xprt->sc_ctxt_lock);
+ xprt->sc_ctxt_used--;
+ list_add(&ctxt->free, &xprt->sc_ctxts);
+ spin_unlock_bh(&xprt->sc_ctxt_lock);
}
-/*
- * Temporary NFS req mappings are shared across all transport
- * instances. These are short lived and should be bounded by the number
- * of concurrent server threads * depth of the SQ.
- */
-struct svc_rdma_req_map *svc_rdma_get_req_map(void)
+static void svc_rdma_destroy_ctxts(struct svcxprt_rdma *xprt)
+{
+ while (!list_empty(&xprt->sc_ctxts)) {
+ struct svc_rdma_op_ctxt *ctxt;
+
+ ctxt = list_first_entry(&xprt->sc_ctxts,
+ struct svc_rdma_op_ctxt, free);
+ list_del(&ctxt->free);
+ kfree(ctxt);
+ }
+}
+
+static struct svc_rdma_req_map *alloc_req_map(gfp_t flags)
{
struct svc_rdma_req_map *map;
- map = kmem_cache_alloc(svc_rdma_map_cachep,
- GFP_KERNEL | __GFP_NOFAIL);
+
+ map = kmalloc(sizeof(*map), flags);
+ if (map)
+ INIT_LIST_HEAD(&map->free);
+ return map;
+}
+
+static bool svc_rdma_prealloc_maps(struct svcxprt_rdma *xprt)
+{
+ unsigned int i;
+
+ /* One for each receive buffer on this connection. */
+ i = xprt->sc_max_requests;
+
+ while (i--) {
+ struct svc_rdma_req_map *map;
+
+ map = alloc_req_map(GFP_KERNEL);
+ if (!map) {
+ dprintk("svcrdma: No memory for request map\n");
+ return false;
+ }
+ list_add(&map->free, &xprt->sc_maps);
+ }
+ return true;
+}
+
+struct svc_rdma_req_map *svc_rdma_get_req_map(struct svcxprt_rdma *xprt)
+{
+ struct svc_rdma_req_map *map = NULL;
+
+ spin_lock(&xprt->sc_map_lock);
+ if (list_empty(&xprt->sc_maps))
+ goto out_empty;
+
+ map = list_first_entry(&xprt->sc_maps,
+ struct svc_rdma_req_map, free);
+ list_del_init(&map->free);
+ spin_unlock(&xprt->sc_map_lock);
+
+out:
map->count = 0;
return map;
+
+out_empty:
+ spin_unlock(&xprt->sc_map_lock);
+
+ /* Pre-allocation amount was incorrect */
+ map = alloc_req_map(GFP_NOIO);
+ if (map)
+ goto out;
+
+ WARN_ONCE(1, "svcrdma: empty request map list?\n");
+ return NULL;
}
-void svc_rdma_put_req_map(struct svc_rdma_req_map *map)
+void svc_rdma_put_req_map(struct svcxprt_rdma *xprt,
+ struct svc_rdma_req_map *map)
{
- kmem_cache_free(svc_rdma_map_cachep, map);
+ spin_lock(&xprt->sc_map_lock);
+ list_add(&map->free, &xprt->sc_maps);
+ spin_unlock(&xprt->sc_map_lock);
}
-/* ib_cq event handler */
-static void cq_event_handler(struct ib_event *event, void *context)
+static void svc_rdma_destroy_maps(struct svcxprt_rdma *xprt)
{
- struct svc_xprt *xprt = context;
- dprintk("svcrdma: received CQ event %s (%d), context=%p\n",
- ib_event_msg(event->event), event->event, context);
- set_bit(XPT_CLOSE, &xprt->xpt_flags);
+ while (!list_empty(&xprt->sc_maps)) {
+ struct svc_rdma_req_map *map;
+
+ map = list_first_entry(&xprt->sc_maps,
+ struct svc_rdma_req_map, free);
+ list_del(&map->free);
+ kfree(map);
+ }
}
/* QP event handler */
@@ -203,253 +376,171 @@ static void qp_event_handler(struct ib_event *event, void *context)
}
}
-/*
- * Data Transfer Operation Tasklet
+/**
+ * svc_rdma_wc_receive - Invoked by RDMA provider for each polled Receive WC
+ * @cq: completion queue
+ * @wc: completed WR
*
- * Walks a list of transports with I/O pending, removing entries as
- * they are added to the server's I/O pending list. Two bits indicate
- * if SQ, RQ, or both have I/O pending. The dto_lock is an irqsave
- * spinlock that serializes access to the transport list with the RQ
- * and SQ interrupt handlers.
*/
-static void dto_tasklet_func(unsigned long data)
+static void svc_rdma_wc_receive(struct ib_cq *cq, struct ib_wc *wc)
{
- struct svcxprt_rdma *xprt;
- unsigned long flags;
+ struct svcxprt_rdma *xprt = cq->cq_context;
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_op_ctxt *ctxt;
- spin_lock_irqsave(&dto_lock, flags);
- while (!list_empty(&dto_xprt_q)) {
- xprt = list_entry(dto_xprt_q.next,
- struct svcxprt_rdma, sc_dto_q);
- list_del_init(&xprt->sc_dto_q);
- spin_unlock_irqrestore(&dto_lock, flags);
+ /* WARNING: Only wc->wr_cqe and wc->status are reliable */
+ ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
+ ctxt->wc_status = wc->status;
+ svc_rdma_unmap_dma(ctxt);
- rq_cq_reap(xprt);
- sq_cq_reap(xprt);
+ if (wc->status != IB_WC_SUCCESS)
+ goto flushed;
- svc_xprt_put(&xprt->sc_xprt);
- spin_lock_irqsave(&dto_lock, flags);
- }
- spin_unlock_irqrestore(&dto_lock, flags);
+ /* All wc fields are now known to be valid */
+ ctxt->byte_len = wc->byte_len;
+ spin_lock(&xprt->sc_rq_dto_lock);
+ list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
+ spin_unlock(&xprt->sc_rq_dto_lock);
+
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ if (test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
+ goto out;
+ svc_xprt_enqueue(&xprt->sc_xprt);
+ goto out;
+
+flushed:
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_warn("svcrdma: receive: %s (%u/0x%x)\n",
+ ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ svc_rdma_put_context(ctxt, 1);
+
+out:
+ svc_xprt_put(&xprt->sc_xprt);
}
-/*
- * Receive Queue Completion Handler
- *
- * Since an RQ completion handler is called on interrupt context, we
- * need to defer the handling of the I/O to a tasklet
- */
-static void rq_comp_handler(struct ib_cq *cq, void *cq_context)
+static void svc_rdma_send_wc_common(struct svcxprt_rdma *xprt,
+ struct ib_wc *wc,
+ const char *opname)
{
- struct svcxprt_rdma *xprt = cq_context;
- unsigned long flags;
-
- /* Guard against unconditional flush call for destroyed QP */
- if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
- return;
+ if (wc->status != IB_WC_SUCCESS)
+ goto err;
- /*
- * Set the bit regardless of whether or not it's on the list
- * because it may be on the list already due to an SQ
- * completion.
- */
- set_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags);
+out:
+ atomic_dec(&xprt->sc_sq_count);
+ wake_up(&xprt->sc_send_wait);
+ return;
+
+err:
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ if (wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("svcrdma: %s: %s (%u/0x%x)\n",
+ opname, ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
+ goto out;
+}
- /*
- * If this transport is not already on the DTO transport queue,
- * add it
- */
- spin_lock_irqsave(&dto_lock, flags);
- if (list_empty(&xprt->sc_dto_q)) {
- svc_xprt_get(&xprt->sc_xprt);
- list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
- }
- spin_unlock_irqrestore(&dto_lock, flags);
+static void svc_rdma_send_wc_common_put(struct ib_cq *cq, struct ib_wc *wc,
+ const char *opname)
+{
+ struct svcxprt_rdma *xprt = cq->cq_context;
- /* Tasklet does all the work to avoid irqsave locks. */
- tasklet_schedule(&dto_tasklet);
+ svc_rdma_send_wc_common(xprt, wc, opname);
+ svc_xprt_put(&xprt->sc_xprt);
}
-/*
- * rq_cq_reap - Process the RQ CQ.
- *
- * Take all completing WC off the CQE and enqueue the associated DTO
- * context on the dto_q for the transport.
+/**
+ * svc_rdma_wc_send - Invoked by RDMA provider for each polled Send WC
+ * @cq: completion queue
+ * @wc: completed WR
*
- * Note that caller must hold a transport reference.
*/
-static void rq_cq_reap(struct svcxprt_rdma *xprt)
+void svc_rdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
- int ret;
- struct ib_wc wc;
- struct svc_rdma_op_ctxt *ctxt = NULL;
-
- if (!test_and_clear_bit(RDMAXPRT_RQ_PENDING, &xprt->sc_flags))
- return;
-
- ib_req_notify_cq(xprt->sc_rq_cq, IB_CQ_NEXT_COMP);
- atomic_inc(&rdma_stat_rq_poll);
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_op_ctxt *ctxt;
- while ((ret = ib_poll_cq(xprt->sc_rq_cq, 1, &wc)) > 0) {
- ctxt = (struct svc_rdma_op_ctxt *)(unsigned long)wc.wr_id;
- ctxt->wc_status = wc.status;
- ctxt->byte_len = wc.byte_len;
- svc_rdma_unmap_dma(ctxt);
- if (wc.status != IB_WC_SUCCESS) {
- /* Close the transport */
- dprintk("svcrdma: transport closing putting ctxt %p\n", ctxt);
- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
- svc_rdma_put_context(ctxt, 1);
- svc_xprt_put(&xprt->sc_xprt);
- continue;
- }
- spin_lock_bh(&xprt->sc_rq_dto_lock);
- list_add_tail(&ctxt->dto_q, &xprt->sc_rq_dto_q);
- spin_unlock_bh(&xprt->sc_rq_dto_lock);
- svc_xprt_put(&xprt->sc_xprt);
- }
+ svc_rdma_send_wc_common_put(cq, wc, "send");
- if (ctxt)
- atomic_inc(&rdma_stat_rq_prod);
-
- set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- /*
- * If data arrived before established event,
- * don't enqueue. This defers RPC I/O until the
- * RDMA connection is complete.
- */
- if (!test_bit(RDMAXPRT_CONN_PENDING, &xprt->sc_flags))
- svc_xprt_enqueue(&xprt->sc_xprt);
+ ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 1);
}
-/*
- * Process a completion context
+/**
+ * svc_rdma_wc_write - Invoked by RDMA provider for each polled Write WC
+ * @cq: completion queue
+ * @wc: completed WR
+ *
*/
-static void process_context(struct svcxprt_rdma *xprt,
- struct svc_rdma_op_ctxt *ctxt)
+void svc_rdma_wc_write(struct ib_cq *cq, struct ib_wc *wc)
{
- svc_rdma_unmap_dma(ctxt);
-
- switch (ctxt->wr_op) {
- case IB_WR_SEND:
- if (ctxt->frmr)
- pr_err("svcrdma: SEND: ctxt->frmr != NULL\n");
- svc_rdma_put_context(ctxt, 1);
- break;
-
- case IB_WR_RDMA_WRITE:
- if (ctxt->frmr)
- pr_err("svcrdma: WRITE: ctxt->frmr != NULL\n");
- svc_rdma_put_context(ctxt, 0);
- break;
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_op_ctxt *ctxt;
- case IB_WR_RDMA_READ:
- case IB_WR_RDMA_READ_WITH_INV:
- svc_rdma_put_frmr(xprt, ctxt->frmr);
- if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
- struct svc_rdma_op_ctxt *read_hdr = ctxt->read_hdr;
- if (read_hdr) {
- spin_lock_bh(&xprt->sc_rq_dto_lock);
- set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
- list_add_tail(&read_hdr->dto_q,
- &xprt->sc_read_complete_q);
- spin_unlock_bh(&xprt->sc_rq_dto_lock);
- } else {
- pr_err("svcrdma: ctxt->read_hdr == NULL\n");
- }
- svc_xprt_enqueue(&xprt->sc_xprt);
- }
- svc_rdma_put_context(ctxt, 0);
- break;
+ svc_rdma_send_wc_common_put(cq, wc, "write");
- default:
- printk(KERN_ERR "svcrdma: unexpected completion type, "
- "opcode=%d\n",
- ctxt->wr_op);
- break;
- }
+ ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_context(ctxt, 0);
}
-/*
- * Send Queue Completion Handler - potentially called on interrupt context.
+/**
+ * svc_rdma_wc_reg - Invoked by RDMA provider for each polled FASTREG WC
+ * @cq: completion queue
+ * @wc: completed WR
*
- * Note that caller must hold a transport reference.
*/
-static void sq_cq_reap(struct svcxprt_rdma *xprt)
+void svc_rdma_wc_reg(struct ib_cq *cq, struct ib_wc *wc)
{
- struct svc_rdma_op_ctxt *ctxt = NULL;
- struct ib_wc wc_a[6];
- struct ib_wc *wc;
- struct ib_cq *cq = xprt->sc_sq_cq;
- int ret;
-
- memset(wc_a, 0, sizeof(wc_a));
-
- if (!test_and_clear_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags))
- return;
+ svc_rdma_send_wc_common_put(cq, wc, "fastreg");
+}
- ib_req_notify_cq(xprt->sc_sq_cq, IB_CQ_NEXT_COMP);
- atomic_inc(&rdma_stat_sq_poll);
- while ((ret = ib_poll_cq(cq, ARRAY_SIZE(wc_a), wc_a)) > 0) {
- int i;
+/**
+ * svc_rdma_wc_read - Invoked by RDMA provider for each polled Read WC
+ * @cq: completion queue
+ * @wc: completed WR
+ *
+ */
+void svc_rdma_wc_read(struct ib_cq *cq, struct ib_wc *wc)
+{
+ struct svcxprt_rdma *xprt = cq->cq_context;
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct svc_rdma_op_ctxt *ctxt;
- for (i = 0; i < ret; i++) {
- wc = &wc_a[i];
- if (wc->status != IB_WC_SUCCESS) {
- dprintk("svcrdma: sq wc err status %s (%d)\n",
- ib_wc_status_msg(wc->status),
- wc->status);
+ svc_rdma_send_wc_common(xprt, wc, "read");
- /* Close the transport */
- set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
- }
+ ctxt = container_of(cqe, struct svc_rdma_op_ctxt, cqe);
+ svc_rdma_unmap_dma(ctxt);
+ svc_rdma_put_frmr(xprt, ctxt->frmr);
- /* Decrement used SQ WR count */
- atomic_dec(&xprt->sc_sq_count);
- wake_up(&xprt->sc_send_wait);
+ if (test_bit(RDMACTXT_F_LAST_CTXT, &ctxt->flags)) {
+ struct svc_rdma_op_ctxt *read_hdr;
- ctxt = (struct svc_rdma_op_ctxt *)
- (unsigned long)wc->wr_id;
- if (ctxt)
- process_context(xprt, ctxt);
+ read_hdr = ctxt->read_hdr;
+ spin_lock(&xprt->sc_rq_dto_lock);
+ list_add_tail(&read_hdr->dto_q,
+ &xprt->sc_read_complete_q);
+ spin_unlock(&xprt->sc_rq_dto_lock);
- svc_xprt_put(&xprt->sc_xprt);
- }
+ set_bit(XPT_DATA, &xprt->sc_xprt.xpt_flags);
+ svc_xprt_enqueue(&xprt->sc_xprt);
}
- if (ctxt)
- atomic_inc(&rdma_stat_sq_prod);
+ svc_rdma_put_context(ctxt, 0);
+ svc_xprt_put(&xprt->sc_xprt);
}
-static void sq_comp_handler(struct ib_cq *cq, void *cq_context)
+/**
+ * svc_rdma_wc_inv - Invoked by RDMA provider for each polled LOCAL_INV WC
+ * @cq: completion queue
+ * @wc: completed WR
+ *
+ */
+void svc_rdma_wc_inv(struct ib_cq *cq, struct ib_wc *wc)
{
- struct svcxprt_rdma *xprt = cq_context;
- unsigned long flags;
-
- /* Guard against unconditional flush call for destroyed QP */
- if (atomic_read(&xprt->sc_xprt.xpt_ref.refcount)==0)
- return;
-
- /*
- * Set the bit regardless of whether or not it's on the list
- * because it may be on the list already due to an RQ
- * completion.
- */
- set_bit(RDMAXPRT_SQ_PENDING, &xprt->sc_flags);
-
- /*
- * If this transport is not already on the DTO transport queue,
- * add it
- */
- spin_lock_irqsave(&dto_lock, flags);
- if (list_empty(&xprt->sc_dto_q)) {
- svc_xprt_get(&xprt->sc_xprt);
- list_add_tail(&xprt->sc_dto_q, &dto_xprt_q);
- }
- spin_unlock_irqrestore(&dto_lock, flags);
-
- /* Tasklet does all the work to avoid irqsave locks. */
- tasklet_schedule(&dto_tasklet);
+ svc_rdma_send_wc_common_put(cq, wc, "localInv");
}
static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
@@ -465,19 +556,15 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
INIT_LIST_HEAD(&cma_xprt->sc_rq_dto_q);
INIT_LIST_HEAD(&cma_xprt->sc_read_complete_q);
INIT_LIST_HEAD(&cma_xprt->sc_frmr_q);
+ INIT_LIST_HEAD(&cma_xprt->sc_ctxts);
+ INIT_LIST_HEAD(&cma_xprt->sc_maps);
init_waitqueue_head(&cma_xprt->sc_send_wait);
spin_lock_init(&cma_xprt->sc_lock);
spin_lock_init(&cma_xprt->sc_rq_dto_lock);
spin_lock_init(&cma_xprt->sc_frmr_q_lock);
-
- cma_xprt->sc_ord = svcrdma_ord;
-
- cma_xprt->sc_max_req_size = svcrdma_max_req_size;
- cma_xprt->sc_max_requests = svcrdma_max_requests;
- cma_xprt->sc_sq_depth = svcrdma_max_requests * RPCRDMA_SQ_DEPTH_MULT;
- atomic_set(&cma_xprt->sc_sq_count, 0);
- atomic_set(&cma_xprt->sc_ctxt_used, 0);
+ spin_lock_init(&cma_xprt->sc_ctxt_lock);
+ spin_lock_init(&cma_xprt->sc_map_lock);
if (listener)
set_bit(XPT_LISTENER, &cma_xprt->sc_xprt.xpt_flags);
@@ -485,7 +572,7 @@ static struct svcxprt_rdma *rdma_create_xprt(struct svc_serv *serv,
return cma_xprt;
}
-int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
+int svc_rdma_post_recv(struct svcxprt_rdma *xprt, gfp_t flags)
{
struct ib_recv_wr recv_wr, *bad_recv_wr;
struct svc_rdma_op_ctxt *ctxt;
@@ -498,12 +585,15 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
ctxt = svc_rdma_get_context(xprt);
buflen = 0;
ctxt->direction = DMA_FROM_DEVICE;
+ ctxt->cqe.done = svc_rdma_wc_receive;
for (sge_no = 0; buflen < xprt->sc_max_req_size; sge_no++) {
if (sge_no >= xprt->sc_max_sge) {
pr_err("svcrdma: Too many sges (%d)\n", sge_no);
goto err_put_ctxt;
}
- page = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
+ page = alloc_page(flags);
+ if (!page)
+ goto err_put_ctxt;
ctxt->pages[sge_no] = page;
pa = ib_dma_map_page(xprt->sc_cm_id->device,
page, 0, PAGE_SIZE,
@@ -513,14 +603,14 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
atomic_inc(&xprt->sc_dma_used);
ctxt->sge[sge_no].addr = pa;
ctxt->sge[sge_no].length = PAGE_SIZE;
- ctxt->sge[sge_no].lkey = xprt->sc_dma_lkey;
+ ctxt->sge[sge_no].lkey = xprt->sc_pd->local_dma_lkey;
ctxt->count = sge_no + 1;
buflen += PAGE_SIZE;
}
recv_wr.next = NULL;
recv_wr.sg_list = &ctxt->sge[0];
recv_wr.num_sge = ctxt->count;
- recv_wr.wr_id = (u64)(unsigned long)ctxt;
+ recv_wr.wr_cqe = &ctxt->cqe;
svc_xprt_get(&xprt->sc_xprt);
ret = ib_post_recv(xprt->sc_qp, &recv_wr, &bad_recv_wr);
@@ -537,6 +627,21 @@ int svc_rdma_post_recv(struct svcxprt_rdma *xprt)
return -ENOMEM;
}
+int svc_rdma_repost_recv(struct svcxprt_rdma *xprt, gfp_t flags)
+{
+ int ret = 0;
+
+ ret = svc_rdma_post_recv(xprt, flags);
+ if (ret) {
+ pr_err("svcrdma: could not post a receive buffer, err=%d.\n",
+ ret);
+ pr_err("svcrdma: closing transport %p.\n", xprt);
+ set_bit(XPT_CLOSE, &xprt->sc_xprt.xpt_flags);
+ ret = -ENOTCONN;
+ }
+ return ret;
+}
+
/*
* This function handles the CONNECT_REQUEST event on a listening
* endpoint. It is passed the cma_id for the _new_ connection. The context in
@@ -692,8 +797,8 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
if (!cma_xprt)
return ERR_PTR(-ENOMEM);
- listen_id = rdma_create_id(rdma_listen_handler, cma_xprt, RDMA_PS_TCP,
- IB_QPT_RC);
+ listen_id = rdma_create_id(&init_net, rdma_listen_handler, cma_xprt,
+ RDMA_PS_TCP, IB_QPT_RC);
if (IS_ERR(listen_id)) {
ret = PTR_ERR(listen_id);
dprintk("svcrdma: rdma_create_id failed = %d\n", ret);
@@ -732,7 +837,7 @@ static struct svc_xprt *svc_rdma_create(struct svc_serv *serv,
static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
{
struct ib_mr *mr;
- struct ib_fast_reg_page_list *pl;
+ struct scatterlist *sg;
struct svc_rdma_fastreg_mr *frmr;
u32 num_sg;
@@ -745,13 +850,14 @@ static struct svc_rdma_fastreg_mr *rdma_alloc_frmr(struct svcxprt_rdma *xprt)
if (IS_ERR(mr))
goto err_free_frmr;
- pl = ib_alloc_fast_reg_page_list(xprt->sc_cm_id->device,
- num_sg);
- if (IS_ERR(pl))
+ sg = kcalloc(RPCSVC_MAXPAGES, sizeof(*sg), GFP_KERNEL);
+ if (!sg)
goto err_free_mr;
+ sg_init_table(sg, RPCSVC_MAXPAGES);
+
frmr->mr = mr;
- frmr->page_list = pl;
+ frmr->sg = sg;
INIT_LIST_HEAD(&frmr->frmr_list);
return frmr;
@@ -771,8 +877,8 @@ static void rdma_dealloc_frmr_q(struct svcxprt_rdma *xprt)
frmr = list_entry(xprt->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
+ kfree(frmr->sg);
ib_dereg_mr(frmr->mr);
- ib_free_fast_reg_page_list(frmr->page_list);
kfree(frmr);
}
}
@@ -786,8 +892,7 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
frmr = list_entry(rdma->sc_frmr_q.next,
struct svc_rdma_fastreg_mr, frmr_list);
list_del_init(&frmr->frmr_list);
- frmr->map_len = 0;
- frmr->page_list_len = 0;
+ frmr->sg_nents = 0;
}
spin_unlock_bh(&rdma->sc_frmr_q_lock);
if (frmr)
@@ -796,25 +901,13 @@ struct svc_rdma_fastreg_mr *svc_rdma_get_frmr(struct svcxprt_rdma *rdma)
return rdma_alloc_frmr(rdma);
}
-static void frmr_unmap_dma(struct svcxprt_rdma *xprt,
- struct svc_rdma_fastreg_mr *frmr)
-{
- int page_no;
- for (page_no = 0; page_no < frmr->page_list_len; page_no++) {
- dma_addr_t addr = frmr->page_list->page_list[page_no];
- if (ib_dma_mapping_error(frmr->mr->device, addr))
- continue;
- atomic_dec(&xprt->sc_dma_used);
- ib_dma_unmap_page(frmr->mr->device, addr, PAGE_SIZE,
- frmr->direction);
- }
-}
-
void svc_rdma_put_frmr(struct svcxprt_rdma *rdma,
struct svc_rdma_fastreg_mr *frmr)
{
if (frmr) {
- frmr_unmap_dma(rdma, frmr);
+ ib_dma_unmap_sg(rdma->sc_cm_id->device,
+ frmr->sg, frmr->sg_nents, frmr->direction);
+ atomic_dec(&rdma->sc_dma_used);
spin_lock_bh(&rdma->sc_frmr_q_lock);
WARN_ON_ONCE(!list_empty(&frmr->frmr_list));
list_add(&frmr->frmr_list, &rdma->sc_frmr_q);
@@ -838,13 +931,10 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
struct svcxprt_rdma *listen_rdma;
struct svcxprt_rdma *newxprt = NULL;
struct rdma_conn_param conn_param;
- struct ib_cq_init_attr cq_attr = {};
struct ib_qp_init_attr qp_attr;
- struct ib_device_attr devattr;
- int uninitialized_var(dma_mr_acc);
- int need_dma_mr = 0;
- int ret;
- int i;
+ struct ib_device *dev;
+ unsigned int i;
+ int ret = 0;
listen_rdma = container_of(xprt, struct svcxprt_rdma, sc_xprt);
clear_bit(XPT_CONN, &xprt->xpt_flags);
@@ -864,51 +954,48 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
dprintk("svcrdma: newxprt from accept queue = %p, cm_id=%p\n",
newxprt, newxprt->sc_cm_id);
- ret = ib_query_device(newxprt->sc_cm_id->device, &devattr);
- if (ret) {
- dprintk("svcrdma: could not query device attributes on "
- "device %p, rc=%d\n", newxprt->sc_cm_id->device, ret);
- goto errout;
- }
+ dev = newxprt->sc_cm_id->device;
/* Qualify the transport resource defaults with the
* capabilities of this particular device */
- newxprt->sc_max_sge = min((size_t)devattr.max_sge,
+ newxprt->sc_max_sge = min((size_t)dev->attrs.max_sge,
(size_t)RPCSVC_MAXPAGES);
- newxprt->sc_max_sge_rd = min_t(size_t, devattr.max_sge_rd,
+ newxprt->sc_max_sge_rd = min_t(size_t, dev->attrs.max_sge_rd,
RPCSVC_MAXPAGES);
- newxprt->sc_max_requests = min((size_t)devattr.max_qp_wr,
- (size_t)svcrdma_max_requests);
- newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_max_requests;
+ newxprt->sc_max_req_size = svcrdma_max_req_size;
+ newxprt->sc_max_requests = min_t(u32, dev->attrs.max_qp_wr,
+ svcrdma_max_requests);
+ newxprt->sc_max_bc_requests = min_t(u32, dev->attrs.max_qp_wr,
+ svcrdma_max_bc_requests);
+ newxprt->sc_rq_depth = newxprt->sc_max_requests +
+ newxprt->sc_max_bc_requests;
+ newxprt->sc_sq_depth = RPCRDMA_SQ_DEPTH_MULT * newxprt->sc_rq_depth;
+
+ if (!svc_rdma_prealloc_ctxts(newxprt))
+ goto errout;
+ if (!svc_rdma_prealloc_maps(newxprt))
+ goto errout;
/*
* Limit ORD based on client limit, local device limit, and
* configured svcrdma limit.
*/
- newxprt->sc_ord = min_t(size_t, devattr.max_qp_rd_atom, newxprt->sc_ord);
+ newxprt->sc_ord = min_t(size_t, dev->attrs.max_qp_rd_atom, newxprt->sc_ord);
newxprt->sc_ord = min_t(size_t, svcrdma_ord, newxprt->sc_ord);
- newxprt->sc_pd = ib_alloc_pd(newxprt->sc_cm_id->device);
+ newxprt->sc_pd = ib_alloc_pd(dev);
if (IS_ERR(newxprt->sc_pd)) {
dprintk("svcrdma: error creating PD for connect request\n");
goto errout;
}
- cq_attr.cqe = newxprt->sc_sq_depth;
- newxprt->sc_sq_cq = ib_create_cq(newxprt->sc_cm_id->device,
- sq_comp_handler,
- cq_event_handler,
- newxprt,
- &cq_attr);
+ newxprt->sc_sq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_sq_depth,
+ 0, IB_POLL_SOFTIRQ);
if (IS_ERR(newxprt->sc_sq_cq)) {
dprintk("svcrdma: error creating SQ CQ for connect request\n");
goto errout;
}
- cq_attr.cqe = newxprt->sc_max_requests;
- newxprt->sc_rq_cq = ib_create_cq(newxprt->sc_cm_id->device,
- rq_comp_handler,
- cq_event_handler,
- newxprt,
- &cq_attr);
+ newxprt->sc_rq_cq = ib_alloc_cq(dev, newxprt, newxprt->sc_rq_depth,
+ 0, IB_POLL_SOFTIRQ);
if (IS_ERR(newxprt->sc_rq_cq)) {
dprintk("svcrdma: error creating RQ CQ for connect request\n");
goto errout;
@@ -918,7 +1005,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
qp_attr.event_handler = qp_event_handler;
qp_attr.qp_context = &newxprt->sc_xprt;
qp_attr.cap.max_send_wr = newxprt->sc_sq_depth;
- qp_attr.cap.max_recv_wr = newxprt->sc_max_requests;
+ qp_attr.cap.max_recv_wr = newxprt->sc_rq_depth;
qp_attr.cap.max_send_sge = newxprt->sc_max_sge;
qp_attr.cap.max_recv_sge = newxprt->sc_max_sge;
qp_attr.sq_sig_type = IB_SIGNAL_REQ_WR;
@@ -932,7 +1019,7 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
" cap.max_send_sge = %d\n"
" cap.max_recv_sge = %d\n",
newxprt->sc_cm_id, newxprt->sc_pd,
- newxprt->sc_cm_id->device, newxprt->sc_pd->device,
+ dev, newxprt->sc_pd->device,
qp_attr.cap.max_send_wr,
qp_attr.cap.max_recv_wr,
qp_attr.cap.max_send_sge,
@@ -968,9 +1055,9 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
* of an RDMA_READ. IB does not.
*/
newxprt->sc_reader = rdma_read_chunk_lcl;
- if (devattr.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
+ if (dev->attrs.device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) {
newxprt->sc_frmr_pg_list_len =
- devattr.max_fast_reg_page_list_len;
+ dev->attrs.max_fast_reg_page_list_len;
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_FAST_REG;
newxprt->sc_reader = rdma_read_chunk_frmr;
}
@@ -978,44 +1065,16 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/*
* Determine if a DMA MR is required and if so, what privs are required
*/
- if (!rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num) &&
- !rdma_ib_or_roce(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num))
+ if (!rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num) &&
+ !rdma_ib_or_roce(dev, newxprt->sc_cm_id->port_num))
goto errout;
- if (!(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG) ||
- !(devattr.device_cap_flags & IB_DEVICE_LOCAL_DMA_LKEY)) {
- need_dma_mr = 1;
- dma_mr_acc = IB_ACCESS_LOCAL_WRITE;
- if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num) &&
- !(newxprt->sc_dev_caps & SVCRDMA_DEVCAP_FAST_REG))
- dma_mr_acc |= IB_ACCESS_REMOTE_WRITE;
- }
-
- if (rdma_protocol_iwarp(newxprt->sc_cm_id->device,
- newxprt->sc_cm_id->port_num))
+ if (rdma_protocol_iwarp(dev, newxprt->sc_cm_id->port_num))
newxprt->sc_dev_caps |= SVCRDMA_DEVCAP_READ_W_INV;
- /* Create the DMA MR if needed, otherwise, use the DMA LKEY */
- if (need_dma_mr) {
- /* Register all of physical memory */
- newxprt->sc_phys_mr =
- ib_get_dma_mr(newxprt->sc_pd, dma_mr_acc);
- if (IS_ERR(newxprt->sc_phys_mr)) {
- dprintk("svcrdma: Failed to create DMA MR ret=%d\n",
- ret);
- goto errout;
- }
- newxprt->sc_dma_lkey = newxprt->sc_phys_mr->lkey;
- } else
- newxprt->sc_dma_lkey =
- newxprt->sc_cm_id->device->local_dma_lkey;
-
/* Post receive buffers */
- for (i = 0; i < newxprt->sc_max_requests; i++) {
- ret = svc_rdma_post_recv(newxprt);
+ for (i = 0; i < newxprt->sc_rq_depth; i++) {
+ ret = svc_rdma_post_recv(newxprt, GFP_KERNEL);
if (ret) {
dprintk("svcrdma: failure posting receive buffers\n");
goto errout;
@@ -1025,13 +1084,6 @@ static struct svc_xprt *svc_rdma_accept(struct svc_xprt *xprt)
/* Swap out the handler */
newxprt->sc_cm_id->event_handler = rdma_cma_handler;
- /*
- * Arm the CQs for the SQ and RQ before accepting so we can't
- * miss the first message
- */
- ib_req_notify_cq(newxprt->sc_sq_cq, IB_CQ_NEXT_COMP);
- ib_req_notify_cq(newxprt->sc_rq_cq, IB_CQ_NEXT_COMP);
-
/* Accept Connection */
set_bit(RDMAXPRT_CONN_PENDING, &newxprt->sc_flags);
memset(&conn_param, 0, sizeof conn_param);
@@ -1114,12 +1166,14 @@ static void __svc_rdma_free(struct work_struct *work)
{
struct svcxprt_rdma *rdma =
container_of(work, struct svcxprt_rdma, sc_work);
- dprintk("svcrdma: svc_rdma_free(%p)\n", rdma);
+ struct svc_xprt *xprt = &rdma->sc_xprt;
+
+ dprintk("svcrdma: %s(%p)\n", __func__, rdma);
/* We should only be called from kref_put */
- if (atomic_read(&rdma->sc_xprt.xpt_ref.refcount) != 0)
+ if (atomic_read(&xprt->xpt_ref.refcount) != 0)
pr_err("svcrdma: sc_xprt still in use? (%d)\n",
- atomic_read(&rdma->sc_xprt.xpt_ref.refcount));
+ atomic_read(&xprt->xpt_ref.refcount));
/*
* Destroy queued, but not processed read completions. Note
@@ -1147,28 +1201,32 @@ static void __svc_rdma_free(struct work_struct *work)
}
/* Warn if we leaked a resource or under-referenced */
- if (atomic_read(&rdma->sc_ctxt_used) != 0)
+ if (rdma->sc_ctxt_used != 0)
pr_err("svcrdma: ctxt still in use? (%d)\n",
- atomic_read(&rdma->sc_ctxt_used));
+ rdma->sc_ctxt_used);
if (atomic_read(&rdma->sc_dma_used) != 0)
pr_err("svcrdma: dma still in use? (%d)\n",
atomic_read(&rdma->sc_dma_used));
- /* De-allocate fastreg mr */
+ /* Final put of backchannel client transport */
+ if (xprt->xpt_bc_xprt) {
+ xprt_put(xprt->xpt_bc_xprt);
+ xprt->xpt_bc_xprt = NULL;
+ }
+
rdma_dealloc_frmr_q(rdma);
+ svc_rdma_destroy_ctxts(rdma);
+ svc_rdma_destroy_maps(rdma);
/* Destroy the QP if present (not a listener) */
if (rdma->sc_qp && !IS_ERR(rdma->sc_qp))
ib_destroy_qp(rdma->sc_qp);
if (rdma->sc_sq_cq && !IS_ERR(rdma->sc_sq_cq))
- ib_destroy_cq(rdma->sc_sq_cq);
+ ib_free_cq(rdma->sc_sq_cq);
if (rdma->sc_rq_cq && !IS_ERR(rdma->sc_rq_cq))
- ib_destroy_cq(rdma->sc_rq_cq);
-
- if (rdma->sc_phys_mr && !IS_ERR(rdma->sc_phys_mr))
- ib_dereg_mr(rdma->sc_phys_mr);
+ ib_free_cq(rdma->sc_rq_cq);
if (rdma->sc_pd && !IS_ERR(rdma->sc_pd))
ib_dealloc_pd(rdma->sc_pd);
@@ -1229,9 +1287,6 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
spin_unlock_bh(&xprt->sc_lock);
atomic_inc(&rdma_stat_sq_starve);
- /* See if we can opportunistically reap SQ WR to make room */
- sq_cq_reap(xprt);
-
/* Wait until SQ WR available if SQ still full */
wait_event(xprt->sc_send_wait,
atomic_read(&xprt->sc_sq_count) <
@@ -1264,55 +1319,3 @@ int svc_rdma_send(struct svcxprt_rdma *xprt, struct ib_send_wr *wr)
}
return ret;
}
-
-void svc_rdma_send_error(struct svcxprt_rdma *xprt, struct rpcrdma_msg *rmsgp,
- enum rpcrdma_errcode err)
-{
- struct ib_send_wr err_wr;
- struct page *p;
- struct svc_rdma_op_ctxt *ctxt;
- __be32 *va;
- int length;
- int ret;
-
- p = alloc_page(GFP_KERNEL | __GFP_NOFAIL);
- va = page_address(p);
-
- /* XDR encode error */
- length = svc_rdma_xdr_encode_error(xprt, rmsgp, err, va);
-
- ctxt = svc_rdma_get_context(xprt);
- ctxt->direction = DMA_FROM_DEVICE;
- ctxt->count = 1;
- ctxt->pages[0] = p;
-
- /* Prepare SGE for local address */
- ctxt->sge[0].addr = ib_dma_map_page(xprt->sc_cm_id->device,
- p, 0, length, DMA_FROM_DEVICE);
- if (ib_dma_mapping_error(xprt->sc_cm_id->device, ctxt->sge[0].addr)) {
- put_page(p);
- svc_rdma_put_context(ctxt, 1);
- return;
- }
- atomic_inc(&xprt->sc_dma_used);
- ctxt->sge[0].lkey = xprt->sc_dma_lkey;
- ctxt->sge[0].length = length;
-
- /* Prepare SEND WR */
- memset(&err_wr, 0, sizeof err_wr);
- ctxt->wr_op = IB_WR_SEND;
- err_wr.wr_id = (unsigned long)ctxt;
- err_wr.sg_list = ctxt->sge;
- err_wr.num_sge = 1;
- err_wr.opcode = IB_WR_SEND;
- err_wr.send_flags = IB_SEND_SIGNALED;
-
- /* Post It */
- ret = svc_rdma_send(xprt, &err_wr);
- if (ret) {
- dprintk("svcrdma: Error %d posting send for protocol error\n",
- ret);
- svc_rdma_unmap_dma(ctxt);
- svc_rdma_put_context(ctxt, 1);
- }
-}
diff --git a/net/sunrpc/xprtrdma/transport.c b/net/sunrpc/xprtrdma/transport.c
index 41e452bc580c..b1b009f10ea3 100644
--- a/net/sunrpc/xprtrdma/transport.c
+++ b/net/sunrpc/xprtrdma/transport.c
@@ -63,7 +63,7 @@
*/
static unsigned int xprt_rdma_slot_table_entries = RPCRDMA_DEF_SLOT_TABLE;
-static unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
+unsigned int xprt_rdma_max_inline_read = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_max_inline_write = RPCRDMA_DEF_INLINE;
static unsigned int xprt_rdma_inline_write_padding;
static unsigned int xprt_rdma_memreg_strategy = RPCRDMA_FRMR;
@@ -143,12 +143,7 @@ static struct ctl_table sunrpc_table[] = {
#endif
-#define RPCRDMA_BIND_TO (60U * HZ)
-#define RPCRDMA_INIT_REEST_TO (5U * HZ)
-#define RPCRDMA_MAX_REEST_TO (30U * HZ)
-#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
-
-static struct rpc_xprt_ops xprt_rdma_procs; /* forward reference */
+static struct rpc_xprt_ops xprt_rdma_procs; /*forward reference */
static void
xprt_rdma_format_addresses4(struct rpc_xprt *xprt, struct sockaddr *sap)
@@ -174,7 +169,7 @@ xprt_rdma_format_addresses6(struct rpc_xprt *xprt, struct sockaddr *sap)
xprt->address_strings[RPC_DISPLAY_NETID] = RPCBIND_NETID_RDMA6;
}
-static void
+void
xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
{
char buf[128];
@@ -203,7 +198,7 @@ xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap)
xprt->address_strings[RPC_DISPLAY_PROTO] = "rdma";
}
-static void
+void
xprt_rdma_free_addresses(struct rpc_xprt *xprt)
{
unsigned int i;
@@ -499,7 +494,7 @@ xprt_rdma_allocate(struct rpc_task *task, size_t size)
if (req == NULL)
return NULL;
- flags = GFP_NOIO | __GFP_NOWARN;
+ flags = RPCRDMA_DEF_GFP;
if (RPC_IS_SWAPPER(task))
flags = __GFP_MEMALLOC | GFP_NOWAIT | __GFP_NOWARN;
@@ -576,6 +571,9 @@ xprt_rdma_free(void *buffer)
rb = container_of(buffer, struct rpcrdma_regbuf, rg_base[0]);
req = rb->rg_owner;
+ if (req->rl_backchannel)
+ return;
+
r_xprt = container_of(req->rl_buffer, struct rpcrdma_xprt, rx_buf);
dprintk("RPC: %s: called on 0x%p\n", __func__, req->rl_reply);
@@ -639,7 +637,7 @@ drop_connection:
return -ENOTCONN; /* implies disconnect */
}
-static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
{
struct rpcrdma_xprt *r_xprt = rpcx_to_rdmax(xprt);
long idle_time = 0;
@@ -676,7 +674,7 @@ static void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq)
static int
xprt_rdma_enable_swap(struct rpc_xprt *xprt)
{
- return -EINVAL;
+ return 0;
}
static void
@@ -705,7 +703,13 @@ static struct rpc_xprt_ops xprt_rdma_procs = {
.print_stats = xprt_rdma_print_stats,
.enable_swap = xprt_rdma_enable_swap,
.disable_swap = xprt_rdma_disable_swap,
- .inject_disconnect = xprt_rdma_inject_disconnect
+ .inject_disconnect = xprt_rdma_inject_disconnect,
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+ .bc_setup = xprt_rdma_bc_setup,
+ .bc_up = xprt_rdma_bc_up,
+ .bc_free_rqst = xprt_rdma_bc_free_rqst,
+ .bc_destroy = xprt_rdma_bc_destroy,
+#endif
};
static struct xprt_class xprt_rdma = {
@@ -732,7 +736,13 @@ void xprt_rdma_cleanup(void)
dprintk("RPC: %s: xprt_unregister returned %i\n",
__func__, rc);
+ rpcrdma_destroy_wq();
frwr_destroy_recovery_wq();
+
+ rc = xprt_unregister_transport(&xprt_rdma_bc);
+ if (rc)
+ dprintk("RPC: %s: xprt_unregister(bc) returned %i\n",
+ __func__, rc);
}
int xprt_rdma_init(void)
@@ -743,8 +753,23 @@ int xprt_rdma_init(void)
if (rc)
return rc;
+ rc = rpcrdma_alloc_wq();
+ if (rc) {
+ frwr_destroy_recovery_wq();
+ return rc;
+ }
+
rc = xprt_register_transport(&xprt_rdma);
if (rc) {
+ rpcrdma_destroy_wq();
+ frwr_destroy_recovery_wq();
+ return rc;
+ }
+
+ rc = xprt_register_transport(&xprt_rdma_bc);
+ if (rc) {
+ xprt_unregister_transport(&xprt_rdma);
+ rpcrdma_destroy_wq();
frwr_destroy_recovery_wq();
return rc;
}
diff --git a/net/sunrpc/xprtrdma/verbs.c b/net/sunrpc/xprtrdma/verbs.c
index 5502d4dade74..f5ed9f982cd7 100644
--- a/net/sunrpc/xprtrdma/verbs.c
+++ b/net/sunrpc/xprtrdma/verbs.c
@@ -68,47 +68,33 @@
* internal functions
*/
-/*
- * handle replies in tasklet context, using a single, global list
- * rdma tasklet function -- just turn around and call the func
- * for all replies on the list
- */
+static struct workqueue_struct *rpcrdma_receive_wq;
-static DEFINE_SPINLOCK(rpcrdma_tk_lock_g);
-static LIST_HEAD(rpcrdma_tasklets_g);
-
-static void
-rpcrdma_run_tasklet(unsigned long data)
+int
+rpcrdma_alloc_wq(void)
{
- struct rpcrdma_rep *rep;
- unsigned long flags;
-
- data = data;
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- while (!list_empty(&rpcrdma_tasklets_g)) {
- rep = list_entry(rpcrdma_tasklets_g.next,
- struct rpcrdma_rep, rr_list);
- list_del(&rep->rr_list);
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+ struct workqueue_struct *recv_wq;
- rpcrdma_reply_handler(rep);
+ recv_wq = alloc_workqueue("xprtrdma_receive",
+ WQ_MEM_RECLAIM | WQ_UNBOUND | WQ_HIGHPRI,
+ 0);
+ if (!recv_wq)
+ return -ENOMEM;
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- }
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
+ rpcrdma_receive_wq = recv_wq;
+ return 0;
}
-static DECLARE_TASKLET(rpcrdma_tasklet_g, rpcrdma_run_tasklet, 0UL);
-
-static void
-rpcrdma_schedule_tasklet(struct list_head *sched_list)
+void
+rpcrdma_destroy_wq(void)
{
- unsigned long flags;
+ struct workqueue_struct *wq;
- spin_lock_irqsave(&rpcrdma_tk_lock_g, flags);
- list_splice_tail(sched_list, &rpcrdma_tasklets_g);
- spin_unlock_irqrestore(&rpcrdma_tk_lock_g, flags);
- tasklet_schedule(&rpcrdma_tasklet_g);
+ if (rpcrdma_receive_wq) {
+ wq = rpcrdma_receive_wq;
+ rpcrdma_receive_wq = NULL;
+ destroy_workqueue(wq);
+ }
}
static void
@@ -126,98 +112,65 @@ rpcrdma_qp_async_error_upcall(struct ib_event *event, void *context)
}
}
+/**
+ * rpcrdma_wc_send - Invoked by RDMA provider for each polled Send WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
static void
-rpcrdma_cq_async_error_upcall(struct ib_event *event, void *context)
+rpcrdma_wc_send(struct ib_cq *cq, struct ib_wc *wc)
{
- struct rpcrdma_ep *ep = context;
-
- pr_err("RPC: %s: %s on device %s ep %p\n",
- __func__, ib_event_msg(event->event),
- event->device->name, context);
- if (ep->rep_connected == 1) {
- ep->rep_connected = -EIO;
- rpcrdma_conn_func(ep);
- wake_up_all(&ep->rep_connect_wait);
- }
+ /* WARNING: Only wr_cqe and status are reliable at this point */
+ if (wc->status != IB_WC_SUCCESS && wc->status != IB_WC_WR_FLUSH_ERR)
+ pr_err("rpcrdma: Send: %s (%u/0x%x)\n",
+ ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
}
static void
-rpcrdma_sendcq_process_wc(struct ib_wc *wc)
+rpcrdma_receive_worker(struct work_struct *work)
{
- /* WARNING: Only wr_id and status are reliable at this point */
- if (wc->wr_id == RPCRDMA_IGNORE_COMPLETION) {
- if (wc->status != IB_WC_SUCCESS &&
- wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("RPC: %s: SEND: %s\n",
- __func__, ib_wc_status_msg(wc->status));
- } else {
- struct rpcrdma_mw *r;
+ struct rpcrdma_rep *rep =
+ container_of(work, struct rpcrdma_rep, rr_work);
- r = (struct rpcrdma_mw *)(unsigned long)wc->wr_id;
- r->mw_sendcompletion(wc);
- }
+ rpcrdma_reply_handler(rep);
}
-static int
-rpcrdma_sendcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
-{
- struct ib_wc *wcs;
- int budget, count, rc;
-
- budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
- do {
- wcs = ep->rep_send_wcs;
-
- rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
- if (rc <= 0)
- return rc;
-
- count = rc;
- while (count-- > 0)
- rpcrdma_sendcq_process_wc(wcs++);
- } while (rc == RPCRDMA_POLLSIZE && --budget);
- return 0;
-}
-
-/*
- * Handle send, fast_reg_mr, and local_inv completions.
- *
- * Send events are typically suppressed and thus do not result
- * in an upcall. Occasionally one is signaled, however. This
- * prevents the provider's completion queue from wrapping and
- * losing a completion.
+/* Perform basic sanity checking to avoid using garbage
+ * to update the credit grant value.
*/
static void
-rpcrdma_sendcq_upcall(struct ib_cq *cq, void *cq_context)
+rpcrdma_update_granted_credits(struct rpcrdma_rep *rep)
{
- struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
- int rc;
+ struct rpcrdma_msg *rmsgp = rdmab_to_msg(rep->rr_rdmabuf);
+ struct rpcrdma_buffer *buffer = &rep->rr_rxprt->rx_buf;
+ u32 credits;
- rc = rpcrdma_sendcq_poll(cq, ep);
- if (rc) {
- dprintk("RPC: %s: ib_poll_cq failed: %i\n",
- __func__, rc);
+ if (rep->rr_len < RPCRDMA_HDRLEN_ERR)
return;
- }
- rc = ib_req_notify_cq(cq,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
- if (rc == 0)
- return;
- if (rc < 0) {
- dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
- __func__, rc);
- return;
- }
+ credits = be32_to_cpu(rmsgp->rm_credit);
+ if (credits == 0)
+ credits = 1; /* don't deadlock */
+ else if (credits > buffer->rb_max_requests)
+ credits = buffer->rb_max_requests;
- rpcrdma_sendcq_poll(cq, ep);
+ atomic_set(&buffer->rb_credits, credits);
}
+/**
+ * rpcrdma_receive_wc - Invoked by RDMA provider for each polled Receive WC
+ * @cq: completion queue (ignored)
+ * @wc: completed WR
+ *
+ */
static void
-rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
+rpcrdma_receive_wc(struct ib_cq *cq, struct ib_wc *wc)
{
- struct rpcrdma_rep *rep =
- (struct rpcrdma_rep *)(unsigned long)wc->wr_id;
+ struct ib_cqe *cqe = wc->wr_cqe;
+ struct rpcrdma_rep *rep = container_of(cqe, struct rpcrdma_rep,
+ rr_cqe);
/* WARNING: Only wr_id and status are reliable at this point */
if (wc->status != IB_WC_SUCCESS)
@@ -234,96 +187,29 @@ rpcrdma_recvcq_process_wc(struct ib_wc *wc, struct list_head *sched_list)
ib_dma_sync_single_for_cpu(rep->rr_device,
rdmab_addr(rep->rr_rdmabuf),
rep->rr_len, DMA_FROM_DEVICE);
- prefetch(rdmab_to_msg(rep->rr_rdmabuf));
+
+ rpcrdma_update_granted_credits(rep);
out_schedule:
- list_add_tail(&rep->rr_list, sched_list);
+ queue_work(rpcrdma_receive_wq, &rep->rr_work);
return;
+
out_fail:
if (wc->status != IB_WC_WR_FLUSH_ERR)
- pr_err("RPC: %s: rep %p: %s\n",
- __func__, rep, ib_wc_status_msg(wc->status));
- rep->rr_len = ~0U;
+ pr_err("rpcrdma: Recv: %s (%u/0x%x)\n",
+ ib_wc_status_msg(wc->status),
+ wc->status, wc->vendor_err);
+ rep->rr_len = RPCRDMA_BAD_LEN;
goto out_schedule;
}
-static int
-rpcrdma_recvcq_poll(struct ib_cq *cq, struct rpcrdma_ep *ep)
-{
- struct list_head sched_list;
- struct ib_wc *wcs;
- int budget, count, rc;
-
- INIT_LIST_HEAD(&sched_list);
- budget = RPCRDMA_WC_BUDGET / RPCRDMA_POLLSIZE;
- do {
- wcs = ep->rep_recv_wcs;
-
- rc = ib_poll_cq(cq, RPCRDMA_POLLSIZE, wcs);
- if (rc <= 0)
- goto out_schedule;
-
- count = rc;
- while (count-- > 0)
- rpcrdma_recvcq_process_wc(wcs++, &sched_list);
- } while (rc == RPCRDMA_POLLSIZE && --budget);
- rc = 0;
-
-out_schedule:
- rpcrdma_schedule_tasklet(&sched_list);
- return rc;
-}
-
-/*
- * Handle receive completions.
- *
- * It is reentrant but processes single events in order to maintain
- * ordering of receives to keep server credits.
- *
- * It is the responsibility of the scheduled tasklet to return
- * recv buffers to the pool. NOTE: this affects synchronization of
- * connection shutdown. That is, the structures required for
- * the completion of the reply handler must remain intact until
- * all memory has been reclaimed.
- */
-static void
-rpcrdma_recvcq_upcall(struct ib_cq *cq, void *cq_context)
-{
- struct rpcrdma_ep *ep = (struct rpcrdma_ep *)cq_context;
- int rc;
-
- rc = rpcrdma_recvcq_poll(cq, ep);
- if (rc) {
- dprintk("RPC: %s: ib_poll_cq failed: %i\n",
- __func__, rc);
- return;
- }
-
- rc = ib_req_notify_cq(cq,
- IB_CQ_NEXT_COMP | IB_CQ_REPORT_MISSED_EVENTS);
- if (rc == 0)
- return;
- if (rc < 0) {
- dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
- __func__, rc);
- return;
- }
-
- rpcrdma_recvcq_poll(cq, ep);
-}
-
static void
rpcrdma_flush_cqs(struct rpcrdma_ep *ep)
{
struct ib_wc wc;
- LIST_HEAD(sched_list);
while (ib_poll_cq(ep->rep_attr.recv_cq, 1, &wc) > 0)
- rpcrdma_recvcq_process_wc(&wc, &sched_list);
- if (!list_empty(&sched_list))
- rpcrdma_schedule_tasklet(&sched_list);
- while (ib_poll_cq(ep->rep_attr.send_cq, 1, &wc) > 0)
- rpcrdma_sendcq_process_wc(&wc);
+ rpcrdma_receive_wc(NULL, &wc);
}
static int
@@ -384,6 +270,7 @@ rpcrdma_conn_upcall(struct rdma_cm_id *id, struct rdma_cm_event *event)
connected:
dprintk("RPC: %s: %sconnected\n",
__func__, connstate > 0 ? "" : "dis");
+ atomic_set(&xprt->rx_buf.rb_credits, 1);
ep->rep_connected = connstate;
rpcrdma_conn_func(ep);
wake_up_all(&ep->rep_connect_wait);
@@ -432,7 +319,8 @@ rpcrdma_create_id(struct rpcrdma_xprt *xprt,
init_completion(&ia->ri_done);
- id = rdma_create_id(rpcrdma_conn_upcall, xprt, RDMA_PS_TCP, IB_QPT_RC);
+ id = rdma_create_id(&init_net, rpcrdma_conn_upcall, xprt, RDMA_PS_TCP,
+ IB_QPT_RC);
if (IS_ERR(id)) {
rc = PTR_ERR(id);
dprintk("RPC: %s: rdma_create_id() failed %i\n",
@@ -515,7 +403,6 @@ int
rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
{
struct rpcrdma_ia *ia = &xprt->rx_ia;
- struct ib_device_attr *devattr = &ia->ri_devattr;
int rc;
ia->ri_dma_mr = NULL;
@@ -535,16 +422,10 @@ rpcrdma_ia_open(struct rpcrdma_xprt *xprt, struct sockaddr *addr, int memreg)
goto out2;
}
- rc = ib_query_device(ia->ri_device, devattr);
- if (rc) {
- dprintk("RPC: %s: ib_query_device failed %d\n",
- __func__, rc);
- goto out3;
- }
-
if (memreg == RPCRDMA_FRMR) {
- if (!(devattr->device_cap_flags & IB_DEVICE_MEM_MGT_EXTENSIONS) ||
- (devattr->max_fast_reg_page_list_len == 0)) {
+ if (!(ia->ri_device->attrs.device_cap_flags &
+ IB_DEVICE_MEM_MGT_EXTENSIONS) ||
+ (ia->ri_device->attrs.max_fast_reg_page_list_len == 0)) {
dprintk("RPC: %s: FRMR registration "
"not supported by HCA\n", __func__);
memreg = RPCRDMA_MTHCAFMR;
@@ -619,29 +500,37 @@ int
rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
struct rpcrdma_create_data_internal *cdata)
{
- struct ib_device_attr *devattr = &ia->ri_devattr;
struct ib_cq *sendcq, *recvcq;
- struct ib_cq_init_attr cq_attr = {};
- int rc, err;
+ unsigned int max_qp_wr;
+ int rc;
- if (devattr->max_sge < RPCRDMA_MAX_IOVS) {
+ if (ia->ri_device->attrs.max_sge < RPCRDMA_MAX_IOVS) {
dprintk("RPC: %s: insufficient sge's available\n",
__func__);
return -ENOMEM;
}
+ if (ia->ri_device->attrs.max_qp_wr <= RPCRDMA_BACKWARD_WRS) {
+ dprintk("RPC: %s: insufficient wqe's available\n",
+ __func__);
+ return -ENOMEM;
+ }
+ max_qp_wr = ia->ri_device->attrs.max_qp_wr - RPCRDMA_BACKWARD_WRS;
+
/* check provider's send/recv wr limits */
- if (cdata->max_requests > devattr->max_qp_wr)
- cdata->max_requests = devattr->max_qp_wr;
+ if (cdata->max_requests > max_qp_wr)
+ cdata->max_requests = max_qp_wr;
ep->rep_attr.event_handler = rpcrdma_qp_async_error_upcall;
ep->rep_attr.qp_context = ep;
ep->rep_attr.srq = NULL;
ep->rep_attr.cap.max_send_wr = cdata->max_requests;
+ ep->rep_attr.cap.max_send_wr += RPCRDMA_BACKWARD_WRS;
rc = ia->ri_ops->ro_open(ia, ep, cdata);
if (rc)
return rc;
ep->rep_attr.cap.max_recv_wr = cdata->max_requests;
+ ep->rep_attr.cap.max_recv_wr += RPCRDMA_BACKWARD_WRS;
ep->rep_attr.cap.max_send_sge = RPCRDMA_MAX_IOVS;
ep->rep_attr.cap.max_recv_sge = 1;
ep->rep_attr.cap.max_inline_data = 0;
@@ -659,17 +548,15 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* set trigger for requesting send completion */
ep->rep_cqinit = ep->rep_attr.cap.max_send_wr/2 - 1;
- if (ep->rep_cqinit > RPCRDMA_MAX_UNSIGNALED_SENDS)
- ep->rep_cqinit = RPCRDMA_MAX_UNSIGNALED_SENDS;
- else if (ep->rep_cqinit <= 2)
- ep->rep_cqinit = 0;
+ if (ep->rep_cqinit <= 2)
+ ep->rep_cqinit = 0; /* always signal? */
INIT_CQCOUNT(ep);
init_waitqueue_head(&ep->rep_connect_wait);
INIT_DELAYED_WORK(&ep->rep_connect_worker, rpcrdma_connect_worker);
- cq_attr.cqe = ep->rep_attr.cap.max_send_wr + 1;
- sendcq = ib_create_cq(ia->ri_device, rpcrdma_sendcq_upcall,
- rpcrdma_cq_async_error_upcall, ep, &cq_attr);
+ sendcq = ib_alloc_cq(ia->ri_device, NULL,
+ ep->rep_attr.cap.max_send_wr + 1,
+ 0, IB_POLL_SOFTIRQ);
if (IS_ERR(sendcq)) {
rc = PTR_ERR(sendcq);
dprintk("RPC: %s: failed to create send CQ: %i\n",
@@ -677,16 +564,9 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
goto out1;
}
- rc = ib_req_notify_cq(sendcq, IB_CQ_NEXT_COMP);
- if (rc) {
- dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
- __func__, rc);
- goto out2;
- }
-
- cq_attr.cqe = ep->rep_attr.cap.max_recv_wr + 1;
- recvcq = ib_create_cq(ia->ri_device, rpcrdma_recvcq_upcall,
- rpcrdma_cq_async_error_upcall, ep, &cq_attr);
+ recvcq = ib_alloc_cq(ia->ri_device, NULL,
+ ep->rep_attr.cap.max_recv_wr + 1,
+ 0, IB_POLL_SOFTIRQ);
if (IS_ERR(recvcq)) {
rc = PTR_ERR(recvcq);
dprintk("RPC: %s: failed to create recv CQ: %i\n",
@@ -694,14 +574,6 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
goto out2;
}
- rc = ib_req_notify_cq(recvcq, IB_CQ_NEXT_COMP);
- if (rc) {
- dprintk("RPC: %s: ib_req_notify_cq failed: %i\n",
- __func__, rc);
- ib_destroy_cq(recvcq);
- goto out2;
- }
-
ep->rep_attr.send_cq = sendcq;
ep->rep_attr.recv_cq = recvcq;
@@ -713,11 +585,11 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
/* Client offers RDMA Read but does not initiate */
ep->rep_remote_cma.initiator_depth = 0;
- if (devattr->max_qp_rd_atom > 32) /* arbitrary but <= 255 */
+ if (ia->ri_device->attrs.max_qp_rd_atom > 32) /* arbitrary but <= 255 */
ep->rep_remote_cma.responder_resources = 32;
else
ep->rep_remote_cma.responder_resources =
- devattr->max_qp_rd_atom;
+ ia->ri_device->attrs.max_qp_rd_atom;
ep->rep_remote_cma.retry_count = 7;
ep->rep_remote_cma.flow_control = 0;
@@ -726,10 +598,7 @@ rpcrdma_ep_create(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia,
return 0;
out2:
- err = ib_destroy_cq(sendcq);
- if (err)
- dprintk("RPC: %s: ib_destroy_cq returned %i\n",
- __func__, err);
+ ib_free_cq(sendcq);
out1:
if (ia->ri_dma_mr)
ib_dereg_mr(ia->ri_dma_mr);
@@ -764,15 +633,8 @@ rpcrdma_ep_destroy(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
ia->ri_id->qp = NULL;
}
- rc = ib_destroy_cq(ep->rep_attr.recv_cq);
- if (rc)
- dprintk("RPC: %s: ib_destroy_cq returned %i\n",
- __func__, rc);
-
- rc = ib_destroy_cq(ep->rep_attr.send_cq);
- if (rc)
- dprintk("RPC: %s: ib_destroy_cq returned %i\n",
- __func__, rc);
+ ib_free_cq(ep->rep_attr.recv_cq);
+ ib_free_cq(ep->rep_attr.send_cq);
if (ia->ri_dma_mr) {
rc = ib_dereg_mr(ia->ri_dma_mr);
@@ -885,7 +747,22 @@ retry:
}
rc = ep->rep_connected;
} else {
+ struct rpcrdma_xprt *r_xprt;
+ unsigned int extras;
+
dprintk("RPC: %s: connected\n", __func__);
+
+ r_xprt = container_of(ia, struct rpcrdma_xprt, rx_ia);
+ extras = r_xprt->rx_buf.rb_bc_srv_max_requests;
+
+ if (extras) {
+ rc = rpcrdma_ep_post_extra_recv(r_xprt, extras);
+ if (rc) {
+ pr_warn("%s: rpcrdma_ep_post_extra_recv: %i\n",
+ __func__, rc);
+ rc = 0;
+ }
+ }
}
out:
@@ -922,20 +799,26 @@ rpcrdma_ep_disconnect(struct rpcrdma_ep *ep, struct rpcrdma_ia *ia)
}
}
-static struct rpcrdma_req *
+struct rpcrdma_req *
rpcrdma_create_req(struct rpcrdma_xprt *r_xprt)
{
+ struct rpcrdma_buffer *buffer = &r_xprt->rx_buf;
struct rpcrdma_req *req;
req = kzalloc(sizeof(*req), GFP_KERNEL);
if (req == NULL)
return ERR_PTR(-ENOMEM);
+ INIT_LIST_HEAD(&req->rl_free);
+ spin_lock(&buffer->rb_reqslock);
+ list_add(&req->rl_all, &buffer->rb_allreqs);
+ spin_unlock(&buffer->rb_reqslock);
+ req->rl_cqe.done = rpcrdma_wc_send;
req->rl_buffer = &r_xprt->rx_buf;
return req;
}
-static struct rpcrdma_rep *
+struct rpcrdma_rep *
rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
@@ -956,7 +839,9 @@ rpcrdma_create_rep(struct rpcrdma_xprt *r_xprt)
}
rep->rr_device = ia->ri_device;
+ rep->rr_cqe.done = rpcrdma_receive_wc;
rep->rr_rxprt = r_xprt;
+ INIT_WORK(&rep->rr_work, rpcrdma_receive_worker);
return rep;
out_free:
@@ -970,44 +855,22 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
{
struct rpcrdma_buffer *buf = &r_xprt->rx_buf;
struct rpcrdma_ia *ia = &r_xprt->rx_ia;
- struct rpcrdma_create_data_internal *cdata = &r_xprt->rx_data;
- char *p;
- size_t len;
int i, rc;
- buf->rb_max_requests = cdata->max_requests;
+ buf->rb_max_requests = r_xprt->rx_data.max_requests;
+ buf->rb_bc_srv_max_requests = 0;
spin_lock_init(&buf->rb_lock);
-
- /* Need to allocate:
- * 1. arrays for send and recv pointers
- * 2. arrays of struct rpcrdma_req to fill in pointers
- * 3. array of struct rpcrdma_rep for replies
- * Send/recv buffers in req/rep need to be registered
- */
- len = buf->rb_max_requests *
- (sizeof(struct rpcrdma_req *) + sizeof(struct rpcrdma_rep *));
-
- p = kzalloc(len, GFP_KERNEL);
- if (p == NULL) {
- dprintk("RPC: %s: req_t/rep_t/pad kzalloc(%zd) failed\n",
- __func__, len);
- rc = -ENOMEM;
- goto out;
- }
- buf->rb_pool = p; /* for freeing it later */
-
- buf->rb_send_bufs = (struct rpcrdma_req **) p;
- p = (char *) &buf->rb_send_bufs[buf->rb_max_requests];
- buf->rb_recv_bufs = (struct rpcrdma_rep **) p;
- p = (char *) &buf->rb_recv_bufs[buf->rb_max_requests];
+ atomic_set(&buf->rb_credits, 1);
rc = ia->ri_ops->ro_init(r_xprt);
if (rc)
goto out;
+ INIT_LIST_HEAD(&buf->rb_send_bufs);
+ INIT_LIST_HEAD(&buf->rb_allreqs);
+ spin_lock_init(&buf->rb_reqslock);
for (i = 0; i < buf->rb_max_requests; i++) {
struct rpcrdma_req *req;
- struct rpcrdma_rep *rep;
req = rpcrdma_create_req(r_xprt);
if (IS_ERR(req)) {
@@ -1016,7 +879,13 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
rc = PTR_ERR(req);
goto out;
}
- buf->rb_send_bufs[i] = req;
+ req->rl_backchannel = false;
+ list_add(&req->rl_free, &buf->rb_send_bufs);
+ }
+
+ INIT_LIST_HEAD(&buf->rb_recv_bufs);
+ for (i = 0; i < buf->rb_max_requests + 2; i++) {
+ struct rpcrdma_rep *rep;
rep = rpcrdma_create_rep(r_xprt);
if (IS_ERR(rep)) {
@@ -1025,7 +894,7 @@ rpcrdma_buffer_create(struct rpcrdma_xprt *r_xprt)
rc = PTR_ERR(rep);
goto out;
}
- buf->rb_recv_bufs[i] = rep;
+ list_add(&rep->rr_list, &buf->rb_recv_bufs);
}
return 0;
@@ -1034,22 +903,38 @@ out:
return rc;
}
+static struct rpcrdma_req *
+rpcrdma_buffer_get_req_locked(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_req *req;
+
+ req = list_first_entry(&buf->rb_send_bufs,
+ struct rpcrdma_req, rl_free);
+ list_del(&req->rl_free);
+ return req;
+}
+
+static struct rpcrdma_rep *
+rpcrdma_buffer_get_rep_locked(struct rpcrdma_buffer *buf)
+{
+ struct rpcrdma_rep *rep;
+
+ rep = list_first_entry(&buf->rb_recv_bufs,
+ struct rpcrdma_rep, rr_list);
+ list_del(&rep->rr_list);
+ return rep;
+}
+
static void
rpcrdma_destroy_rep(struct rpcrdma_ia *ia, struct rpcrdma_rep *rep)
{
- if (!rep)
- return;
-
rpcrdma_free_regbuf(ia, rep->rr_rdmabuf);
kfree(rep);
}
-static void
+void
rpcrdma_destroy_req(struct rpcrdma_ia *ia, struct rpcrdma_req *req)
{
- if (!req)
- return;
-
rpcrdma_free_regbuf(ia, req->rl_sendbuf);
rpcrdma_free_regbuf(ia, req->rl_rdmabuf);
kfree(req);
@@ -1059,25 +944,29 @@ void
rpcrdma_buffer_destroy(struct rpcrdma_buffer *buf)
{
struct rpcrdma_ia *ia = rdmab_to_ia(buf);
- int i;
- /* clean up in reverse order from create
- * 1. recv mr memory (mr free, then kfree)
- * 2. send mr memory (mr free, then kfree)
- * 3. MWs
- */
- dprintk("RPC: %s: entering\n", __func__);
+ while (!list_empty(&buf->rb_recv_bufs)) {
+ struct rpcrdma_rep *rep;
- for (i = 0; i < buf->rb_max_requests; i++) {
- if (buf->rb_recv_bufs)
- rpcrdma_destroy_rep(ia, buf->rb_recv_bufs[i]);
- if (buf->rb_send_bufs)
- rpcrdma_destroy_req(ia, buf->rb_send_bufs[i]);
+ rep = rpcrdma_buffer_get_rep_locked(buf);
+ rpcrdma_destroy_rep(ia, rep);
}
- ia->ri_ops->ro_destroy(buf);
+ spin_lock(&buf->rb_reqslock);
+ while (!list_empty(&buf->rb_allreqs)) {
+ struct rpcrdma_req *req;
- kfree(buf->rb_pool);
+ req = list_first_entry(&buf->rb_allreqs,
+ struct rpcrdma_req, rl_all);
+ list_del(&req->rl_all);
+
+ spin_unlock(&buf->rb_reqslock);
+ rpcrdma_destroy_req(ia, req);
+ spin_lock(&buf->rb_reqslock);
+ }
+ spin_unlock(&buf->rb_reqslock);
+
+ ia->ri_ops->ro_destroy(buf);
}
struct rpcrdma_mw *
@@ -1109,53 +998,34 @@ rpcrdma_put_mw(struct rpcrdma_xprt *r_xprt, struct rpcrdma_mw *mw)
spin_unlock(&buf->rb_mwlock);
}
-static void
-rpcrdma_buffer_put_sendbuf(struct rpcrdma_req *req, struct rpcrdma_buffer *buf)
-{
- buf->rb_send_bufs[--buf->rb_send_index] = req;
- req->rl_niovs = 0;
- if (req->rl_reply) {
- buf->rb_recv_bufs[--buf->rb_recv_index] = req->rl_reply;
- req->rl_reply = NULL;
- }
-}
-
/*
* Get a set of request/reply buffers.
*
- * Reply buffer (if needed) is attached to send buffer upon return.
- * Rule:
- * rb_send_index and rb_recv_index MUST always be pointing to the
- * *next* available buffer (non-NULL). They are incremented after
- * removing buffers, and decremented *before* returning them.
+ * Reply buffer (if available) is attached to send buffer upon return.
*/
struct rpcrdma_req *
rpcrdma_buffer_get(struct rpcrdma_buffer *buffers)
{
struct rpcrdma_req *req;
- unsigned long flags;
-
- spin_lock_irqsave(&buffers->rb_lock, flags);
- if (buffers->rb_send_index == buffers->rb_max_requests) {
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
- dprintk("RPC: %s: out of request buffers\n", __func__);
- return ((struct rpcrdma_req *)NULL);
- }
-
- req = buffers->rb_send_bufs[buffers->rb_send_index];
- if (buffers->rb_send_index < buffers->rb_recv_index) {
- dprintk("RPC: %s: %d extra receives outstanding (ok)\n",
- __func__,
- buffers->rb_recv_index - buffers->rb_send_index);
- req->rl_reply = NULL;
- } else {
- req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
- buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
- }
- buffers->rb_send_bufs[buffers->rb_send_index++] = NULL;
+ spin_lock(&buffers->rb_lock);
+ if (list_empty(&buffers->rb_send_bufs))
+ goto out_reqbuf;
+ req = rpcrdma_buffer_get_req_locked(buffers);
+ if (list_empty(&buffers->rb_recv_bufs))
+ goto out_repbuf;
+ req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock(&buffers->rb_lock);
+ return req;
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+out_reqbuf:
+ spin_unlock(&buffers->rb_lock);
+ pr_warn("RPC: %s: out of request buffers\n", __func__);
+ return NULL;
+out_repbuf:
+ spin_unlock(&buffers->rb_lock);
+ pr_warn("RPC: %s: out of reply buffers\n", __func__);
+ req->rl_reply = NULL;
return req;
}
@@ -1167,30 +1037,31 @@ void
rpcrdma_buffer_put(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
- unsigned long flags;
+ struct rpcrdma_rep *rep = req->rl_reply;
+
+ req->rl_niovs = 0;
+ req->rl_reply = NULL;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- rpcrdma_buffer_put_sendbuf(req, buffers);
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_lock(&buffers->rb_lock);
+ list_add_tail(&req->rl_free, &buffers->rb_send_bufs);
+ if (rep)
+ list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock(&buffers->rb_lock);
}
/*
* Recover reply buffers from pool.
- * This happens when recovering from error conditions.
- * Post-increment counter/array index.
+ * This happens when recovering from disconnect.
*/
void
rpcrdma_recv_buffer_get(struct rpcrdma_req *req)
{
struct rpcrdma_buffer *buffers = req->rl_buffer;
- unsigned long flags;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- if (buffers->rb_recv_index < buffers->rb_max_requests) {
- req->rl_reply = buffers->rb_recv_bufs[buffers->rb_recv_index];
- buffers->rb_recv_bufs[buffers->rb_recv_index++] = NULL;
- }
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_lock(&buffers->rb_lock);
+ if (!list_empty(&buffers->rb_recv_bufs))
+ req->rl_reply = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock(&buffers->rb_lock);
}
/*
@@ -1201,11 +1072,10 @@ void
rpcrdma_recv_buffer_put(struct rpcrdma_rep *rep)
{
struct rpcrdma_buffer *buffers = &rep->rr_rxprt->rx_buf;
- unsigned long flags;
- spin_lock_irqsave(&buffers->rb_lock, flags);
- buffers->rb_recv_bufs[--buffers->rb_recv_index] = rep;
- spin_unlock_irqrestore(&buffers->rb_lock, flags);
+ spin_lock(&buffers->rb_lock);
+ list_add_tail(&rep->rr_list, &buffers->rb_recv_bufs);
+ spin_unlock(&buffers->rb_lock);
}
/*
@@ -1307,7 +1177,7 @@ rpcrdma_ep_post(struct rpcrdma_ia *ia,
}
send_wr.next = NULL;
- send_wr.wr_id = RPCRDMA_IGNORE_COMPLETION;
+ send_wr.wr_cqe = &req->rl_cqe;
send_wr.sg_list = iov;
send_wr.num_sge = req->rl_niovs;
send_wr.opcode = IB_WR_SEND;
@@ -1345,7 +1215,7 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
int rc;
recv_wr.next = NULL;
- recv_wr.wr_id = (u64) (unsigned long) rep;
+ recv_wr.wr_cqe = &rep->rr_cqe;
recv_wr.sg_list = &rep->rr_rdmabuf->rg_iov;
recv_wr.num_sge = 1;
@@ -1362,6 +1232,46 @@ rpcrdma_ep_post_recv(struct rpcrdma_ia *ia,
return rc;
}
+/**
+ * rpcrdma_ep_post_extra_recv - Post buffers for incoming backchannel requests
+ * @r_xprt: transport associated with these backchannel resources
+ * @min_reqs: minimum number of incoming requests expected
+ *
+ * Returns zero if all requested buffers were posted, or a negative errno.
+ */
+int
+rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *r_xprt, unsigned int count)
+{
+ struct rpcrdma_buffer *buffers = &r_xprt->rx_buf;
+ struct rpcrdma_ia *ia = &r_xprt->rx_ia;
+ struct rpcrdma_ep *ep = &r_xprt->rx_ep;
+ struct rpcrdma_rep *rep;
+ int rc;
+
+ while (count--) {
+ spin_lock(&buffers->rb_lock);
+ if (list_empty(&buffers->rb_recv_bufs))
+ goto out_reqbuf;
+ rep = rpcrdma_buffer_get_rep_locked(buffers);
+ spin_unlock(&buffers->rb_lock);
+
+ rc = rpcrdma_ep_post_recv(ia, ep, rep);
+ if (rc)
+ goto out_rc;
+ }
+
+ return 0;
+
+out_reqbuf:
+ spin_unlock(&buffers->rb_lock);
+ pr_warn("%s: no extra receive buffers\n", __func__);
+ return -ENOMEM;
+
+out_rc:
+ rpcrdma_recv_buffer_put(rep);
+ return rc;
+}
+
/* How many chunk list items fit within our inline buffers?
*/
unsigned int
diff --git a/net/sunrpc/xprtrdma/xprt_rdma.h b/net/sunrpc/xprtrdma/xprt_rdma.h
index c09414e6f91b..2ebc743cb96f 100644
--- a/net/sunrpc/xprtrdma/xprt_rdma.h
+++ b/net/sunrpc/xprtrdma/xprt_rdma.h
@@ -55,6 +55,11 @@
#define RDMA_RESOLVE_TIMEOUT (5000) /* 5 seconds */
#define RDMA_CONNECT_RETRY_MAX (2) /* retries if no listener backlog */
+#define RPCRDMA_BIND_TO (60U * HZ)
+#define RPCRDMA_INIT_REEST_TO (5U * HZ)
+#define RPCRDMA_MAX_REEST_TO (30U * HZ)
+#define RPCRDMA_IDLE_DISC_TO (5U * 60 * HZ)
+
/*
* Interface Adapter -- one per transport instance
*/
@@ -68,7 +73,6 @@ struct rpcrdma_ia {
struct completion ri_done;
int ri_async_rc;
unsigned int ri_max_frmr_depth;
- struct ib_device_attr ri_devattr;
struct ib_qp_attr ri_qp_attr;
struct ib_qp_init_attr ri_qp_init_attr;
};
@@ -77,9 +81,6 @@ struct rpcrdma_ia {
* RDMA Endpoint -- one per transport instance
*/
-#define RPCRDMA_WC_BUDGET (128)
-#define RPCRDMA_POLLSIZE (16)
-
struct rpcrdma_ep {
atomic_t rep_cqcount;
int rep_cqinit;
@@ -89,22 +90,20 @@ struct rpcrdma_ep {
struct rdma_conn_param rep_remote_cma;
struct sockaddr_storage rep_remote_addr;
struct delayed_work rep_connect_worker;
- struct ib_wc rep_send_wcs[RPCRDMA_POLLSIZE];
- struct ib_wc rep_recv_wcs[RPCRDMA_POLLSIZE];
};
-/*
- * Force a signaled SEND Work Request every so often,
- * in case the provider needs to do some housekeeping.
- */
-#define RPCRDMA_MAX_UNSIGNALED_SENDS (32)
-
#define INIT_CQCOUNT(ep) atomic_set(&(ep)->rep_cqcount, (ep)->rep_cqinit)
#define DECR_CQCOUNT(ep) atomic_sub_return(1, &(ep)->rep_cqcount)
-/* Force completion handler to ignore the signal
+/* Pre-allocate extra Work Requests for handling backward receives
+ * and sends. This is a fixed value because the Work Queues are
+ * allocated when the forward channel is set up.
*/
-#define RPCRDMA_IGNORE_COMPLETION (0ULL)
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+#define RPCRDMA_BACKWARD_WRS (8)
+#else
+#define RPCRDMA_BACKWARD_WRS (0)
+#endif
/* Registered buffer -- registered kmalloc'd memory for RDMA SEND/RECV
*
@@ -143,6 +142,8 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
return (struct rpcrdma_msg *)rb->rg_base;
}
+#define RPCRDMA_DEF_GFP (GFP_NOIO | __GFP_NOWARN)
+
/*
* struct rpcrdma_rep -- this structure encapsulates state required to recv
* and complete a reply, asychronously. It needs several pieces of
@@ -166,13 +167,17 @@ rdmab_to_msg(struct rpcrdma_regbuf *rb)
struct rpcrdma_buffer;
struct rpcrdma_rep {
+ struct ib_cqe rr_cqe;
unsigned int rr_len;
struct ib_device *rr_device;
struct rpcrdma_xprt *rr_rxprt;
+ struct work_struct rr_work;
struct list_head rr_list;
struct rpcrdma_regbuf *rr_rdmabuf;
};
+#define RPCRDMA_BAD_LEN (~0U)
+
/*
* struct rpcrdma_mw - external memory region metadata
*
@@ -193,11 +198,18 @@ enum rpcrdma_frmr_state {
};
struct rpcrdma_frmr {
- struct ib_fast_reg_page_list *fr_pgl;
+ struct scatterlist *sg;
+ int sg_nents;
struct ib_mr *fr_mr;
+ struct ib_cqe fr_cqe;
enum rpcrdma_frmr_state fr_state;
+ struct completion fr_linv_done;
struct work_struct fr_work;
struct rpcrdma_xprt *fr_xprt;
+ union {
+ struct ib_reg_wr fr_regwr;
+ struct ib_send_wr fr_invwr;
+ };
};
struct rpcrdma_fmr {
@@ -209,8 +221,7 @@ struct rpcrdma_mw {
union {
struct rpcrdma_fmr fmr;
struct rpcrdma_frmr frmr;
- } r;
- void (*mw_sendcompletion)(struct ib_wc *);
+ };
struct list_head mw_list;
struct list_head mw_all;
};
@@ -255,6 +266,7 @@ struct rpcrdma_mr_seg { /* chunk descriptors */
#define RPCRDMA_MAX_IOVS (2)
struct rpcrdma_req {
+ struct list_head rl_free;
unsigned int rl_niovs;
unsigned int rl_nchunks;
unsigned int rl_connect_cookie;
@@ -264,6 +276,10 @@ struct rpcrdma_req {
struct rpcrdma_regbuf *rl_rdmabuf;
struct rpcrdma_regbuf *rl_sendbuf;
struct rpcrdma_mr_seg rl_segments[RPCRDMA_MAX_SEGS];
+
+ struct ib_cqe rl_cqe;
+ struct list_head rl_all;
+ bool rl_backchannel;
};
static inline struct rpcrdma_req *
@@ -288,12 +304,17 @@ struct rpcrdma_buffer {
struct list_head rb_all;
char *rb_pool;
- spinlock_t rb_lock; /* protect buf arrays */
+ spinlock_t rb_lock; /* protect buf lists */
+ struct list_head rb_send_bufs;
+ struct list_head rb_recv_bufs;
u32 rb_max_requests;
- int rb_send_index;
- int rb_recv_index;
- struct rpcrdma_req **rb_send_bufs;
- struct rpcrdma_rep **rb_recv_bufs;
+ atomic_t rb_credits; /* most recent credit grant */
+
+ u32 rb_bc_srv_max_requests;
+ spinlock_t rb_reqslock; /* protect rb_allreqs */
+ struct list_head rb_allreqs;
+
+ u32 rb_bc_max_requests;
};
#define rdmab_to_ia(b) (&container_of((b), struct rpcrdma_xprt, rx_buf)->rx_ia)
@@ -339,6 +360,7 @@ struct rpcrdma_stats {
unsigned long failed_marshal_count;
unsigned long bad_reply_count;
unsigned long nomsg_call_count;
+ unsigned long bcall_count;
};
/*
@@ -348,6 +370,8 @@ struct rpcrdma_xprt;
struct rpcrdma_memreg_ops {
int (*ro_map)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *, int, bool);
+ void (*ro_unmap_sync)(struct rpcrdma_xprt *,
+ struct rpcrdma_req *);
int (*ro_unmap)(struct rpcrdma_xprt *,
struct rpcrdma_mr_seg *);
int (*ro_open)(struct rpcrdma_ia *,
@@ -414,6 +438,9 @@ int rpcrdma_ep_post_recv(struct rpcrdma_ia *, struct rpcrdma_ep *,
/*
* Buffer calls - xprtrdma/verbs.c
*/
+struct rpcrdma_req *rpcrdma_create_req(struct rpcrdma_xprt *);
+struct rpcrdma_rep *rpcrdma_create_rep(struct rpcrdma_xprt *);
+void rpcrdma_destroy_req(struct rpcrdma_ia *, struct rpcrdma_req *);
int rpcrdma_buffer_create(struct rpcrdma_xprt *);
void rpcrdma_buffer_destroy(struct rpcrdma_buffer *);
@@ -430,10 +457,14 @@ void rpcrdma_free_regbuf(struct rpcrdma_ia *,
struct rpcrdma_regbuf *);
unsigned int rpcrdma_max_segments(struct rpcrdma_xprt *);
+int rpcrdma_ep_post_extra_recv(struct rpcrdma_xprt *, unsigned int);
int frwr_alloc_recovery_wq(void);
void frwr_destroy_recovery_wq(void);
+int rpcrdma_alloc_wq(void);
+void rpcrdma_destroy_wq(void);
+
/*
* Wrappers for chunk registration, shared by read/write chunk code.
*/
@@ -491,14 +522,25 @@ int rpcrdma_marshal_req(struct rpc_rqst *);
/* RPC/RDMA module init - xprtrdma/transport.c
*/
+extern unsigned int xprt_rdma_max_inline_read;
+void xprt_rdma_format_addresses(struct rpc_xprt *xprt, struct sockaddr *sap);
+void xprt_rdma_free_addresses(struct rpc_xprt *xprt);
+void xprt_rdma_print_stats(struct rpc_xprt *xprt, struct seq_file *seq);
int xprt_rdma_init(void);
void xprt_rdma_cleanup(void);
-/* Temporary NFS request map cache. Created in svc_rdma.c */
-extern struct kmem_cache *svc_rdma_map_cachep;
-/* WR context cache. Created in svc_rdma.c */
-extern struct kmem_cache *svc_rdma_ctxt_cachep;
-/* Workqueue created in svc_rdma.c */
-extern struct workqueue_struct *svc_rdma_wq;
+/* Backchannel calls - xprtrdma/backchannel.c
+ */
+#if defined(CONFIG_SUNRPC_BACKCHANNEL)
+int xprt_rdma_bc_setup(struct rpc_xprt *, unsigned int);
+int xprt_rdma_bc_up(struct svc_serv *, struct net *);
+int rpcrdma_bc_post_recv(struct rpcrdma_xprt *, unsigned int);
+void rpcrdma_bc_receive_call(struct rpcrdma_xprt *, struct rpcrdma_rep *);
+int rpcrdma_bc_marshal_reply(struct rpc_rqst *);
+void xprt_rdma_bc_free_rqst(struct rpc_rqst *);
+void xprt_rdma_bc_destroy(struct rpc_xprt *, unsigned int);
+#endif /* CONFIG_SUNRPC_BACKCHANNEL */
+
+extern struct xprt_class xprt_rdma_bc;
#endif /* _LINUX_SUNRPC_XPRT_RDMA_H */
diff --git a/net/sunrpc/xprtsock.c b/net/sunrpc/xprtsock.c
index 1a85e0ed0b48..65e759569e48 100644
--- a/net/sunrpc/xprtsock.c
+++ b/net/sunrpc/xprtsock.c
@@ -360,8 +360,10 @@ static int xs_send_pagedata(struct socket *sock, struct xdr_buf *xdr, unsigned i
int flags = XS_SENDMSG_FLAGS;
remainder -= len;
- if (remainder != 0 || more)
+ if (more)
flags |= MSG_MORE;
+ if (remainder != 0)
+ flags |= MSG_SENDPAGE_NOTLAST | MSG_MORE;
err = do_sendpage(sock, *ppage, base, len, flags);
if (remainder == 0 || err != len)
break;
@@ -396,7 +398,6 @@ static int xs_sendpages(struct socket *sock, struct sockaddr *addr, int addrlen,
if (unlikely(!sock))
return -ENOTSOCK;
- clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags);
if (base != 0) {
addr = NULL;
addrlen = 0;
@@ -440,7 +441,6 @@ static void xs_nospace_callback(struct rpc_task *task)
struct sock_xprt *transport = container_of(task->tk_rqstp->rq_xprt, struct sock_xprt, xprt);
transport->inet->sk_write_pending--;
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
}
/**
@@ -465,20 +465,11 @@ static int xs_nospace(struct rpc_task *task)
/* Don't race with disconnect */
if (xprt_connected(xprt)) {
- if (test_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags)) {
- /*
- * Notify TCP that we're limited by the application
- * window size
- */
- set_bit(SOCK_NOSPACE, &transport->sock->flags);
- sk->sk_write_pending++;
- /* ...and wait for more buffer space */
- xprt_wait_for_buffer_space(task, xs_nospace_callback);
- }
- } else {
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ /* wait for more buffer space */
+ sk->sk_write_pending++;
+ xprt_wait_for_buffer_space(task, xs_nospace_callback);
+ } else
ret = -ENOTCONN;
- }
spin_unlock_bh(&xprt->transport_lock);
@@ -614,9 +605,6 @@ process_status:
case -EAGAIN:
status = xs_nospace(task);
break;
- default:
- dprintk("RPC: sendmsg returned unrecognized error %d\n",
- -status);
case -ENETUNREACH:
case -ENOBUFS:
case -EPIPE:
@@ -624,7 +612,10 @@ process_status:
case -EPERM:
/* When the server has died, an ICMP port unreachable message
* prompts ECONNREFUSED. */
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
}
return status;
@@ -704,16 +695,16 @@ static int xs_tcp_send_request(struct rpc_task *task)
case -EAGAIN:
status = xs_nospace(task);
break;
- default:
- dprintk("RPC: sendmsg returned unrecognized error %d\n",
- -status);
case -ECONNRESET:
case -ECONNREFUSED:
case -ENOTCONN:
case -EADDRINUSE:
case -ENOBUFS:
case -EPIPE:
- clear_bit(SOCK_ASYNC_NOSPACE, &transport->sock->flags);
+ break;
+ default:
+ dprintk("RPC: sendmsg returned unrecognized error %d\n",
+ -status);
}
return status;
@@ -823,6 +814,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
kernel_sock_shutdown(sock, SHUT_RDWR);
+ mutex_lock(&transport->recv_mutex);
write_lock_bh(&sk->sk_callback_lock);
transport->inet = NULL;
transport->sock = NULL;
@@ -833,6 +825,7 @@ static void xs_reset_transport(struct sock_xprt *transport)
xprt_clear_connected(xprt);
write_unlock_bh(&sk->sk_callback_lock);
xs_sock_reset_connection_flags(xprt);
+ mutex_unlock(&transport->recv_mutex);
trace_rpc_socket_close(xprt, sock);
sock_release(sock);
@@ -886,6 +879,7 @@ static void xs_destroy(struct rpc_xprt *xprt)
cancel_delayed_work_sync(&transport->connect_worker);
xs_close(xprt);
+ cancel_work_sync(&transport->recv_worker);
xs_xprt_free(xprt);
module_put(THIS_MODULE);
}
@@ -906,44 +900,36 @@ static int xs_local_copy_to_xdr(struct xdr_buf *xdr, struct sk_buff *skb)
}
/**
- * xs_local_data_ready - "data ready" callback for AF_LOCAL sockets
- * @sk: socket with data to read
+ * xs_local_data_read_skb
+ * @xprt: transport
+ * @sk: socket
+ * @skb: skbuff
*
* Currently this assumes we can read the whole reply in a single gulp.
*/
-static void xs_local_data_ready(struct sock *sk)
+static void xs_local_data_read_skb(struct rpc_xprt *xprt,
+ struct sock *sk,
+ struct sk_buff *skb)
{
struct rpc_task *task;
- struct rpc_xprt *xprt;
struct rpc_rqst *rovr;
- struct sk_buff *skb;
- int err, repsize, copied;
+ int repsize, copied;
u32 _xid;
__be32 *xp;
- read_lock_bh(&sk->sk_callback_lock);
- dprintk("RPC: %s...\n", __func__);
- xprt = xprt_from_sock(sk);
- if (xprt == NULL)
- goto out;
-
- skb = skb_recv_datagram(sk, 0, 1, &err);
- if (skb == NULL)
- goto out;
-
repsize = skb->len - sizeof(rpc_fraghdr);
if (repsize < 4) {
dprintk("RPC: impossible RPC reply size %d\n", repsize);
- goto dropit;
+ return;
}
/* Copy the XID from the skb... */
xp = skb_header_pointer(skb, sizeof(rpc_fraghdr), sizeof(_xid), &_xid);
if (xp == NULL)
- goto dropit;
+ return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
@@ -961,50 +947,68 @@ static void xs_local_data_ready(struct sock *sk)
xprt_complete_rqst(task, copied);
out_unlock:
- spin_unlock(&xprt->transport_lock);
- dropit:
- skb_free_datagram(sk, skb);
- out:
- read_unlock_bh(&sk->sk_callback_lock);
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+static void xs_local_data_receive(struct sock_xprt *transport)
+{
+ struct sk_buff *skb;
+ struct sock *sk;
+ int err;
+
+ mutex_lock(&transport->recv_mutex);
+ sk = transport->inet;
+ if (sk == NULL)
+ goto out;
+ for (;;) {
+ skb = skb_recv_datagram(sk, 0, 1, &err);
+ if (skb == NULL)
+ break;
+ xs_local_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ }
+out:
+ mutex_unlock(&transport->recv_mutex);
+}
+
+static void xs_local_data_receive_workfn(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, recv_worker);
+ xs_local_data_receive(transport);
}
/**
- * xs_udp_data_ready - "data ready" callback for UDP sockets
- * @sk: socket with data to read
+ * xs_udp_data_read_skb - receive callback for UDP sockets
+ * @xprt: transport
+ * @sk: socket
+ * @skb: skbuff
*
*/
-static void xs_udp_data_ready(struct sock *sk)
+static void xs_udp_data_read_skb(struct rpc_xprt *xprt,
+ struct sock *sk,
+ struct sk_buff *skb)
{
struct rpc_task *task;
- struct rpc_xprt *xprt;
struct rpc_rqst *rovr;
- struct sk_buff *skb;
- int err, repsize, copied;
+ int repsize, copied;
u32 _xid;
__be32 *xp;
- read_lock_bh(&sk->sk_callback_lock);
- dprintk("RPC: xs_udp_data_ready...\n");
- if (!(xprt = xprt_from_sock(sk)))
- goto out;
-
- if ((skb = skb_recv_datagram(sk, 0, 1, &err)) == NULL)
- goto out;
-
repsize = skb->len - sizeof(struct udphdr);
if (repsize < 4) {
dprintk("RPC: impossible RPC reply size %d!\n", repsize);
- goto dropit;
+ return;
}
/* Copy the XID from the skb... */
xp = skb_header_pointer(skb, sizeof(struct udphdr),
sizeof(_xid), &_xid);
if (xp == NULL)
- goto dropit;
+ return;
/* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
rovr = xprt_lookup_rqst(xprt, *xp);
if (!rovr)
goto out_unlock;
@@ -1025,10 +1029,54 @@ static void xs_udp_data_ready(struct sock *sk)
xprt_complete_rqst(task, copied);
out_unlock:
- spin_unlock(&xprt->transport_lock);
- dropit:
- skb_free_datagram(sk, skb);
- out:
+ spin_unlock_bh(&xprt->transport_lock);
+}
+
+static void xs_udp_data_receive(struct sock_xprt *transport)
+{
+ struct sk_buff *skb;
+ struct sock *sk;
+ int err;
+
+ mutex_lock(&transport->recv_mutex);
+ sk = transport->inet;
+ if (sk == NULL)
+ goto out;
+ for (;;) {
+ skb = skb_recv_datagram(sk, 0, 1, &err);
+ if (skb == NULL)
+ break;
+ xs_udp_data_read_skb(&transport->xprt, sk, skb);
+ skb_free_datagram(sk, skb);
+ }
+out:
+ mutex_unlock(&transport->recv_mutex);
+}
+
+static void xs_udp_data_receive_workfn(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, recv_worker);
+ xs_udp_data_receive(transport);
+}
+
+/**
+ * xs_data_ready - "data ready" callback for UDP sockets
+ * @sk: socket with data to read
+ *
+ */
+static void xs_data_ready(struct sock *sk)
+{
+ struct rpc_xprt *xprt;
+
+ read_lock_bh(&sk->sk_callback_lock);
+ dprintk("RPC: xs_data_ready...\n");
+ xprt = xprt_from_sock(sk);
+ if (xprt != NULL) {
+ struct sock_xprt *transport = container_of(xprt,
+ struct sock_xprt, xprt);
+ queue_work(rpciod_workqueue, &transport->recv_worker);
+ }
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1243,12 +1291,12 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
dprintk("RPC: read reply XID %08x\n", ntohl(transport->tcp_xid));
/* Find and lock the request corresponding to this xid */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
req = xprt_lookup_rqst(xprt, transport->tcp_xid);
if (!req) {
dprintk("RPC: XID %08x request not found!\n",
ntohl(transport->tcp_xid));
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return -1;
}
@@ -1257,7 +1305,7 @@ static inline int xs_tcp_read_reply(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_rqst(req->rq_task, transport->tcp_copied);
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return 0;
}
@@ -1277,10 +1325,10 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
struct rpc_rqst *req;
/* Look up and lock the request corresponding to the given XID */
- spin_lock(&xprt->transport_lock);
+ spin_lock_bh(&xprt->transport_lock);
req = xprt_lookup_bc_request(xprt, transport->tcp_xid);
if (req == NULL) {
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
printk(KERN_WARNING "Callback slot table overflowed\n");
xprt_force_disconnect(xprt);
return -1;
@@ -1291,7 +1339,7 @@ static int xs_tcp_read_callback(struct rpc_xprt *xprt,
if (!(transport->tcp_flags & TCP_RCV_COPY_DATA))
xprt_complete_bc_request(req, transport->tcp_copied);
- spin_unlock(&xprt->transport_lock);
+ spin_unlock_bh(&xprt->transport_lock);
return 0;
}
@@ -1306,6 +1354,17 @@ static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
xs_tcp_read_reply(xprt, desc) :
xs_tcp_read_callback(xprt, desc);
}
+
+static int xs_tcp_bc_up(struct svc_serv *serv, struct net *net)
+{
+ int ret;
+
+ ret = svc_create_xprt(serv, "tcp-bc", net, PF_INET, 0,
+ SVC_SOCK_ANONYMOUS);
+ if (ret < 0)
+ return ret;
+ return 0;
+}
#else
static inline int _xs_tcp_read_data(struct rpc_xprt *xprt,
struct xdr_skb_reader *desc)
@@ -1391,6 +1450,44 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
return len - desc.count;
}
+static void xs_tcp_data_receive(struct sock_xprt *transport)
+{
+ struct rpc_xprt *xprt = &transport->xprt;
+ struct sock *sk;
+ read_descriptor_t rd_desc = {
+ .count = 2*1024*1024,
+ .arg.data = xprt,
+ };
+ unsigned long total = 0;
+ int read = 0;
+
+ mutex_lock(&transport->recv_mutex);
+ sk = transport->inet;
+ if (sk == NULL)
+ goto out;
+
+ /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
+ for (;;) {
+ lock_sock(sk);
+ read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
+ release_sock(sk);
+ if (read <= 0)
+ break;
+ total += read;
+ rd_desc.count = 65536;
+ }
+out:
+ mutex_unlock(&transport->recv_mutex);
+ trace_xs_tcp_data_ready(xprt, read, total);
+}
+
+static void xs_tcp_data_receive_workfn(struct work_struct *work)
+{
+ struct sock_xprt *transport =
+ container_of(work, struct sock_xprt, recv_worker);
+ xs_tcp_data_receive(transport);
+}
+
/**
* xs_tcp_data_ready - "data ready" callback for TCP sockets
* @sk: socket with data to read
@@ -1398,34 +1495,24 @@ static int xs_tcp_data_recv(read_descriptor_t *rd_desc, struct sk_buff *skb, uns
*/
static void xs_tcp_data_ready(struct sock *sk)
{
+ struct sock_xprt *transport;
struct rpc_xprt *xprt;
- read_descriptor_t rd_desc;
- int read;
- unsigned long total = 0;
dprintk("RPC: xs_tcp_data_ready...\n");
read_lock_bh(&sk->sk_callback_lock);
- if (!(xprt = xprt_from_sock(sk))) {
- read = 0;
+ if (!(xprt = xprt_from_sock(sk)))
goto out;
- }
+ transport = container_of(xprt, struct sock_xprt, xprt);
+
/* Any data means we had a useful conversation, so
* the we don't need to delay the next reconnect
*/
if (xprt->reestablish_timeout)
xprt->reestablish_timeout = 0;
+ queue_work(rpciod_workqueue, &transport->recv_worker);
- /* We use rd_desc to pass struct xprt to xs_tcp_data_recv */
- rd_desc.arg.data = xprt;
- do {
- rd_desc.count = 65536;
- read = tcp_read_sock(sk, &rd_desc, xs_tcp_data_recv);
- if (read > 0)
- total += read;
- } while (read > 0);
out:
- trace_xs_tcp_data_ready(xprt, read, total);
read_unlock_bh(&sk->sk_callback_lock);
}
@@ -1511,19 +1598,23 @@ static void xs_tcp_state_change(struct sock *sk)
static void xs_write_space(struct sock *sk)
{
- struct socket *sock;
+ struct socket_wq *wq;
struct rpc_xprt *xprt;
- if (unlikely(!(sock = sk->sk_socket)))
+ if (!sk->sk_socket)
return;
- clear_bit(SOCK_NOSPACE, &sock->flags);
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
if (unlikely(!(xprt = xprt_from_sock(sk))))
return;
- if (test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sock->flags) == 0)
- return;
+ rcu_read_lock();
+ wq = rcu_dereference(sk->sk_wq);
+ if (!wq || test_and_clear_bit(SOCKWQ_ASYNC_NOSPACE, &wq->flags) == 0)
+ goto out;
xprt_write_space(xprt);
+out:
+ rcu_read_unlock();
}
/**
@@ -1753,9 +1844,7 @@ static int xs_bind(struct sock_xprt *transport, struct socket *sock)
*/
static void xs_local_rpcbind(struct rpc_task *task)
{
- rcu_read_lock();
- xprt_set_bound(rcu_dereference(task->tk_client->cl_xprt));
- rcu_read_unlock();
+ xprt_set_bound(task->tk_xprt);
}
static void xs_local_set_port(struct rpc_xprt *xprt, unsigned short port)
@@ -1809,18 +1898,6 @@ static inline void xs_reclassify_socket(int family, struct socket *sock)
}
}
#else
-static inline void xs_reclassify_socketu(struct socket *sock)
-{
-}
-
-static inline void xs_reclassify_socket4(struct socket *sock)
-{
-}
-
-static inline void xs_reclassify_socket6(struct socket *sock)
-{
-}
-
static inline void xs_reclassify_socket(int family, struct socket *sock)
{
}
@@ -1873,7 +1950,7 @@ static int xs_local_finish_connecting(struct rpc_xprt *xprt,
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
- sk->sk_data_ready = xs_local_data_ready;
+ sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
sk->sk_error_report = xs_error_report;
sk->sk_allocation = GFP_NOIO;
@@ -1910,7 +1987,7 @@ static int xs_local_setup_socket(struct sock_xprt *transport)
"transport socket (%d).\n", -status);
goto out;
}
- xs_reclassify_socketu(sock);
+ xs_reclassify_socket(AF_LOCAL, sock);
dprintk("RPC: worker connecting xprt %p via AF_LOCAL to %s\n",
xprt, xprt->address_strings[RPC_DISPLAY_ADDR]);
@@ -2059,7 +2136,7 @@ static void xs_udp_finish_connecting(struct rpc_xprt *xprt, struct socket *sock)
xs_save_old_callbacks(transport, sk);
sk->sk_user_data = xprt;
- sk->sk_data_ready = xs_udp_data_ready;
+ sk->sk_data_ready = xs_data_ready;
sk->sk_write_space = xs_udp_write_space;
sk->sk_allocation = GFP_NOIO;
@@ -2472,7 +2549,7 @@ static int bc_send_request(struct rpc_task *task)
{
struct rpc_rqst *req = task->tk_rqstp;
struct svc_xprt *xprt;
- u32 len;
+ int len;
dprintk("sending request with xid: %08x\n", ntohl(req->rq_xid));
/*
@@ -2580,6 +2657,12 @@ static struct rpc_xprt_ops xs_tcp_ops = {
.enable_swap = xs_enable_swap,
.disable_swap = xs_disable_swap,
.inject_disconnect = xs_inject_disconnect,
+#ifdef CONFIG_SUNRPC_BACKCHANNEL
+ .bc_setup = xprt_setup_bc,
+ .bc_up = xs_tcp_bc_up,
+ .bc_free_rqst = xprt_free_bc_rqst,
+ .bc_destroy = xprt_destroy_bc,
+#endif
};
/*
@@ -2650,6 +2733,7 @@ static struct rpc_xprt *xs_setup_xprt(struct xprt_create *args,
}
new = container_of(xprt, struct sock_xprt, xprt);
+ mutex_init(&new->recv_mutex);
memcpy(&xprt->addr, args->dstaddr, args->addrlen);
xprt->addrlen = args->addrlen;
if (args->srcaddr)
@@ -2703,6 +2787,7 @@ static struct rpc_xprt *xs_setup_local(struct xprt_create *args)
xprt->ops = &xs_local_ops;
xprt->timeout = &xs_local_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_local_data_receive_workfn);
INIT_DELAYED_WORK(&transport->connect_worker,
xs_dummy_setup_socket);
@@ -2774,21 +2859,20 @@ static struct rpc_xprt *xs_setup_udp(struct xprt_create *args)
xprt->timeout = &xs_udp_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_udp_data_receive_workfn);
+ INIT_DELAYED_WORK(&transport->connect_worker, xs_udp_setup_socket);
+
switch (addr->sa_family) {
case AF_INET:
if (((struct sockaddr_in *)addr)->sin_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_udp_setup_socket);
xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP);
break;
case AF_INET6:
if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_udp_setup_socket);
xs_format_peer_addresses(xprt, "udp", RPCBIND_NETID_UDP6);
break;
default:
@@ -2853,21 +2937,20 @@ static struct rpc_xprt *xs_setup_tcp(struct xprt_create *args)
xprt->ops = &xs_tcp_ops;
xprt->timeout = &xs_tcp_default_timeout;
+ INIT_WORK(&transport->recv_worker, xs_tcp_data_receive_workfn);
+ INIT_DELAYED_WORK(&transport->connect_worker, xs_tcp_setup_socket);
+
switch (addr->sa_family) {
case AF_INET:
if (((struct sockaddr_in *)addr)->sin_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_tcp_setup_socket);
xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP);
break;
case AF_INET6:
if (((struct sockaddr_in6 *)addr)->sin6_port != htons(0))
xprt_set_bound(xprt);
- INIT_DELAYED_WORK(&transport->connect_worker,
- xs_tcp_setup_socket);
xs_format_peer_addresses(xprt, "tcp", RPCBIND_NETID_TCP6);
break;
default:
diff --git a/net/switchdev/switchdev.c b/net/switchdev/switchdev.c
index f34e535e93bd..2b9b98f1c2ff 100644
--- a/net/switchdev/switchdev.c
+++ b/net/switchdev/switchdev.c
@@ -20,6 +20,7 @@
#include <linux/list.h>
#include <linux/workqueue.h>
#include <linux/if_vlan.h>
+#include <linux/rtnetlink.h>
#include <net/ip_fib.h>
#include <net/switchdev.h>
@@ -345,6 +346,8 @@ static size_t switchdev_obj_size(const struct switchdev_obj *obj)
return sizeof(struct switchdev_obj_ipv4_fib);
case SWITCHDEV_OBJ_ID_PORT_FDB:
return sizeof(struct switchdev_obj_port_fdb);
+ case SWITCHDEV_OBJ_ID_PORT_MDB:
+ return sizeof(struct switchdev_obj_port_mdb);
default:
BUG();
}
@@ -565,7 +568,6 @@ int switchdev_port_obj_dump(struct net_device *dev, struct switchdev_obj *obj,
}
EXPORT_SYMBOL_GPL(switchdev_port_obj_dump);
-static DEFINE_MUTEX(switchdev_mutex);
static RAW_NOTIFIER_HEAD(switchdev_notif_chain);
/**
@@ -580,9 +582,9 @@ int register_switchdev_notifier(struct notifier_block *nb)
{
int err;
- mutex_lock(&switchdev_mutex);
+ rtnl_lock();
err = raw_notifier_chain_register(&switchdev_notif_chain, nb);
- mutex_unlock(&switchdev_mutex);
+ rtnl_unlock();
return err;
}
EXPORT_SYMBOL_GPL(register_switchdev_notifier);
@@ -598,9 +600,9 @@ int unregister_switchdev_notifier(struct notifier_block *nb)
{
int err;
- mutex_lock(&switchdev_mutex);
+ rtnl_lock();
err = raw_notifier_chain_unregister(&switchdev_notif_chain, nb);
- mutex_unlock(&switchdev_mutex);
+ rtnl_unlock();
return err;
}
EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
@@ -614,16 +616,17 @@ EXPORT_SYMBOL_GPL(unregister_switchdev_notifier);
* Call all network notifier blocks. This should be called by driver
* when it needs to propagate hardware event.
* Return values are same as for atomic_notifier_call_chain().
+ * rtnl_lock must be held.
*/
int call_switchdev_notifiers(unsigned long val, struct net_device *dev,
struct switchdev_notifier_info *info)
{
int err;
+ ASSERT_RTNL();
+
info->dev = dev;
- mutex_lock(&switchdev_mutex);
err = raw_notifier_call_chain(&switchdev_notif_chain, val, info);
- mutex_unlock(&switchdev_mutex);
return err;
}
EXPORT_SYMBOL_GPL(call_switchdev_notifiers);
@@ -723,6 +726,7 @@ static int switchdev_port_vlan_fill(struct sk_buff *skb, struct net_device *dev,
u32 filter_mask)
{
struct switchdev_vlan_dump dump = {
+ .vlan.obj.orig_dev = dev,
.vlan.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
.skb = skb,
.filter_mask = filter_mask,
@@ -757,6 +761,7 @@ int switchdev_port_bridge_getlink(struct sk_buff *skb, u32 pid, u32 seq,
int nlflags)
{
struct switchdev_attr attr = {
+ .orig_dev = dev,
.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
};
u16 mode = BRIDGE_MODE_UNDEF;
@@ -778,6 +783,7 @@ static int switchdev_port_br_setflag(struct net_device *dev,
unsigned long brport_flag)
{
struct switchdev_attr attr = {
+ .orig_dev = dev,
.id = SWITCHDEV_ATTR_ID_PORT_BRIDGE_FLAGS,
};
u8 flag = nla_get_u8(nlattr);
@@ -853,6 +859,7 @@ static int switchdev_port_br_afspec(struct net_device *dev,
struct nlattr *attr;
struct bridge_vlan_info *vinfo;
struct switchdev_obj_port_vlan vlan = {
+ .obj.orig_dev = dev,
.obj.id = SWITCHDEV_OBJ_ID_PORT_VLAN,
};
int rem;
@@ -975,6 +982,7 @@ int switchdev_port_fdb_add(struct ndmsg *ndm, struct nlattr *tb[],
u16 vid, u16 nlm_flags)
{
struct switchdev_obj_port_fdb fdb = {
+ .obj.orig_dev = dev,
.obj.id = SWITCHDEV_OBJ_ID_PORT_FDB,
.vid = vid,
};
@@ -1000,6 +1008,7 @@ int switchdev_port_fdb_del(struct ndmsg *ndm, struct nlattr *tb[],
u16 vid)
{
struct switchdev_obj_port_fdb fdb = {
+ .obj.orig_dev = dev,
.obj.id = SWITCHDEV_OBJ_ID_PORT_FDB,
.vid = vid,
};
@@ -1070,21 +1079,25 @@ nla_put_failure:
* @filter_dev: filter device
* @idx:
*
- * Delete FDB entry from switch device.
+ * Dump FDB entries from switch device.
*/
int switchdev_port_fdb_dump(struct sk_buff *skb, struct netlink_callback *cb,
struct net_device *dev,
struct net_device *filter_dev, int idx)
{
struct switchdev_fdb_dump dump = {
+ .fdb.obj.orig_dev = dev,
.fdb.obj.id = SWITCHDEV_OBJ_ID_PORT_FDB,
.dev = dev,
.skb = skb,
.cb = cb,
.idx = idx,
};
+ int err;
- switchdev_port_obj_dump(dev, &dump.fdb.obj, switchdev_port_fdb_dump_cb);
+ err = switchdev_port_obj_dump(dev, &dump.fdb.obj,
+ switchdev_port_fdb_dump_cb);
+ cb->args[1] = err;
return dump.idx;
}
EXPORT_SYMBOL_GPL(switchdev_port_fdb_dump);
@@ -1135,6 +1148,7 @@ static struct net_device *switchdev_get_dev_by_nhs(struct fib_info *fi)
if (!dev)
return NULL;
+ attr.orig_dev = dev;
if (switchdev_port_attr_get(dev, &attr))
return NULL;
@@ -1194,6 +1208,7 @@ int switchdev_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
if (!dev)
return 0;
+ ipv4_fib.obj.orig_dev = dev;
err = switchdev_port_obj_add(dev, &ipv4_fib.obj);
if (!err)
fi->fib_flags |= RTNH_F_OFFLOAD;
@@ -1238,6 +1253,7 @@ int switchdev_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
if (!dev)
return 0;
+ ipv4_fib.obj.orig_dev = dev;
err = switchdev_port_obj_del(dev, &ipv4_fib.obj);
if (!err)
fi->fib_flags &= ~RTNH_F_OFFLOAD;
@@ -1270,10 +1286,12 @@ static bool switchdev_port_same_parent_id(struct net_device *a,
struct net_device *b)
{
struct switchdev_attr a_attr = {
+ .orig_dev = a,
.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
.flags = SWITCHDEV_F_NO_RECURSE,
};
struct switchdev_attr b_attr = {
+ .orig_dev = b,
.id = SWITCHDEV_ATTR_ID_PORT_PARENT_ID,
.flags = SWITCHDEV_F_NO_RECURSE,
};
diff --git a/net/tipc/bcast.c b/net/tipc/bcast.c
index 9dc239dfe192..ae469b37d852 100644
--- a/net/tipc/bcast.c
+++ b/net/tipc/bcast.c
@@ -332,131 +332,15 @@ void tipc_bcast_remove_peer(struct net *net, struct tipc_link *rcv_l)
tipc_sk_rcv(net, inputq);
}
-static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb,
- struct tipc_stats *stats)
-{
- int i;
- struct nlattr *nest;
-
- struct nla_map {
- __u32 key;
- __u32 val;
- };
-
- struct nla_map map[] = {
- {TIPC_NLA_STATS_RX_INFO, stats->recv_info},
- {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments},
- {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented},
- {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles},
- {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled},
- {TIPC_NLA_STATS_TX_INFO, stats->sent_info},
- {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments},
- {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented},
- {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles},
- {TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled},
- {TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks},
- {TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv},
- {TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks},
- {TIPC_NLA_STATS_TX_ACKS, stats->sent_acks},
- {TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted},
- {TIPC_NLA_STATS_DUPLICATES, stats->duplicates},
- {TIPC_NLA_STATS_LINK_CONGS, stats->link_congs},
- {TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz},
- {TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ?
- (stats->accu_queue_sz / stats->queue_sz_counts) : 0}
- };
-
- nest = nla_nest_start(skb, TIPC_NLA_LINK_STATS);
- if (!nest)
- return -EMSGSIZE;
-
- for (i = 0; i < ARRAY_SIZE(map); i++)
- if (nla_put_u32(skb, map[i].key, map[i].val))
- goto msg_full;
-
- nla_nest_end(skb, nest);
-
- return 0;
-msg_full:
- nla_nest_cancel(skb, nest);
-
- return -EMSGSIZE;
-}
-
-int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
-{
- int err;
- void *hdr;
- struct nlattr *attrs;
- struct nlattr *prop;
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *bcl = tn->bcl;
-
- if (!bcl)
- return 0;
-
- tipc_bcast_lock(net);
-
- hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
- NLM_F_MULTI, TIPC_NL_LINK_GET);
- if (!hdr)
- return -EMSGSIZE;
-
- attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK);
- if (!attrs)
- goto msg_full;
-
- /* The broadcast link is always up */
- if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP))
- goto attr_msg_full;
-
- if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST))
- goto attr_msg_full;
- if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name))
- goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt))
- goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt))
- goto attr_msg_full;
-
- prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP);
- if (!prop)
- goto attr_msg_full;
- if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window))
- goto prop_msg_full;
- nla_nest_end(msg->skb, prop);
-
- err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats);
- if (err)
- goto attr_msg_full;
-
- tipc_bcast_unlock(net);
- nla_nest_end(msg->skb, attrs);
- genlmsg_end(msg->skb, hdr);
-
- return 0;
-
-prop_msg_full:
- nla_nest_cancel(msg->skb, prop);
-attr_msg_full:
- nla_nest_cancel(msg->skb, attrs);
-msg_full:
- tipc_bcast_unlock(net);
- genlmsg_cancel(msg->skb, hdr);
-
- return -EMSGSIZE;
-}
-
int tipc_bclink_reset_stats(struct net *net)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *bcl = tn->bcl;
+ struct tipc_link *l = tipc_bc_sndlink(net);
- if (!bcl)
+ if (!l)
return -ENOPROTOOPT;
tipc_bcast_lock(net);
- memset(&bcl->stats, 0, sizeof(bcl->stats));
+ tipc_link_reset_stats(l);
tipc_bcast_unlock(net);
return 0;
}
@@ -528,13 +412,6 @@ enomem:
return -ENOMEM;
}
-void tipc_bcast_reinit(struct net *net)
-{
- struct tipc_bc_base *b = tipc_bc_base(net);
-
- msg_set_prevnode(b->link->pmsg, tipc_own_addr(net));
-}
-
void tipc_bcast_stop(struct net *net)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
diff --git a/net/tipc/bcast.h b/net/tipc/bcast.h
index 2855b9356a15..d5e79b3767fd 100644
--- a/net/tipc/bcast.h
+++ b/net/tipc/bcast.h
@@ -43,9 +43,9 @@ struct tipc_node;
struct tipc_msg;
struct tipc_nl_msg;
struct tipc_node_map;
+extern const char tipc_bclink_name[];
int tipc_bcast_init(struct net *net);
-void tipc_bcast_reinit(struct net *net);
void tipc_bcast_stop(struct net *net);
void tipc_bcast_add_peer(struct net *net, struct tipc_link *l,
struct sk_buff_head *xmitq);
diff --git a/net/tipc/bearer.c b/net/tipc/bearer.c
index 648f2a67f314..27a5406213c6 100644
--- a/net/tipc/bearer.c
+++ b/net/tipc/bearer.c
@@ -40,6 +40,7 @@
#include "link.h"
#include "discover.h"
#include "bcast.h"
+#include "netlink.h"
#define MAX_ADDR_STR 60
@@ -54,24 +55,7 @@ static struct tipc_media * const media_info_array[] = {
NULL
};
-static const struct nla_policy
-tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = {
- [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_BEARER_NAME] = {
- .type = NLA_STRING,
- .len = TIPC_MAX_BEARER_NAME
- },
- [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED },
- [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 }
-};
-
-static const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = {
- [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING },
- [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED }
-};
-
-static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr);
+static void bearer_disable(struct net *net, struct tipc_bearer *b);
/**
* tipc_media_find - locates specified media object by name
@@ -107,13 +91,13 @@ static struct tipc_media *media_find_id(u8 type)
void tipc_media_addr_printf(char *buf, int len, struct tipc_media_addr *a)
{
char addr_str[MAX_ADDR_STR];
- struct tipc_media *m_ptr;
+ struct tipc_media *m;
int ret;
- m_ptr = media_find_id(a->media_id);
+ m = media_find_id(a->media_id);
- if (m_ptr && !m_ptr->addr2str(a, addr_str, sizeof(addr_str)))
- ret = scnprintf(buf, len, "%s(%s)", m_ptr->name, addr_str);
+ if (m && !m->addr2str(a, addr_str, sizeof(addr_str)))
+ ret = scnprintf(buf, len, "%s(%s)", m->name, addr_str);
else {
u32 i;
@@ -175,13 +159,13 @@ static int bearer_name_validate(const char *name,
struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bearer *b_ptr;
+ struct tipc_bearer *b;
u32 i;
for (i = 0; i < MAX_BEARERS; i++) {
- b_ptr = rtnl_dereference(tn->bearer_list[i]);
- if (b_ptr && (!strcmp(b_ptr->name, name)))
- return b_ptr;
+ b = rtnl_dereference(tn->bearer_list[i]);
+ if (b && (!strcmp(b->name, name)))
+ return b;
}
return NULL;
}
@@ -189,24 +173,24 @@ struct tipc_bearer *tipc_bearer_find(struct net *net, const char *name)
void tipc_bearer_add_dest(struct net *net, u32 bearer_id, u32 dest)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bearer *b_ptr;
+ struct tipc_bearer *b;
rcu_read_lock();
- b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (b_ptr)
- tipc_disc_add_dest(b_ptr->link_req);
+ b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (b)
+ tipc_disc_add_dest(b->link_req);
rcu_read_unlock();
}
void tipc_bearer_remove_dest(struct net *net, u32 bearer_id, u32 dest)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bearer *b_ptr;
+ struct tipc_bearer *b;
rcu_read_lock();
- b_ptr = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
- if (b_ptr)
- tipc_disc_remove_dest(b_ptr->link_req);
+ b = rcu_dereference_rtnl(tn->bearer_list[bearer_id]);
+ if (b)
+ tipc_disc_remove_dest(b->link_req);
rcu_read_unlock();
}
@@ -218,8 +202,8 @@ static int tipc_enable_bearer(struct net *net, const char *name,
struct nlattr *attr[])
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bearer *b_ptr;
- struct tipc_media *m_ptr;
+ struct tipc_bearer *b;
+ struct tipc_media *m;
struct tipc_bearer_names b_names;
char addr_string[16];
u32 bearer_id;
@@ -255,31 +239,31 @@ static int tipc_enable_bearer(struct net *net, const char *name,
return -EINVAL;
}
- m_ptr = tipc_media_find(b_names.media_name);
- if (!m_ptr) {
+ m = tipc_media_find(b_names.media_name);
+ if (!m) {
pr_warn("Bearer <%s> rejected, media <%s> not registered\n",
name, b_names.media_name);
return -EINVAL;
}
if (priority == TIPC_MEDIA_LINK_PRI)
- priority = m_ptr->priority;
+ priority = m->priority;
restart:
bearer_id = MAX_BEARERS;
with_this_prio = 1;
for (i = MAX_BEARERS; i-- != 0; ) {
- b_ptr = rtnl_dereference(tn->bearer_list[i]);
- if (!b_ptr) {
+ b = rtnl_dereference(tn->bearer_list[i]);
+ if (!b) {
bearer_id = i;
continue;
}
- if (!strcmp(name, b_ptr->name)) {
+ if (!strcmp(name, b->name)) {
pr_warn("Bearer <%s> rejected, already enabled\n",
name);
return -EINVAL;
}
- if ((b_ptr->priority == priority) &&
+ if ((b->priority == priority) &&
(++with_this_prio > 2)) {
if (priority-- == 0) {
pr_warn("Bearer <%s> rejected, duplicate priority\n",
@@ -297,35 +281,35 @@ restart:
return -EINVAL;
}
- b_ptr = kzalloc(sizeof(*b_ptr), GFP_ATOMIC);
- if (!b_ptr)
+ b = kzalloc(sizeof(*b), GFP_ATOMIC);
+ if (!b)
return -ENOMEM;
- strcpy(b_ptr->name, name);
- b_ptr->media = m_ptr;
- res = m_ptr->enable_media(net, b_ptr, attr);
+ strcpy(b->name, name);
+ b->media = m;
+ res = m->enable_media(net, b, attr);
if (res) {
pr_warn("Bearer <%s> rejected, enable failure (%d)\n",
name, -res);
return -EINVAL;
}
- b_ptr->identity = bearer_id;
- b_ptr->tolerance = m_ptr->tolerance;
- b_ptr->window = m_ptr->window;
- b_ptr->domain = disc_domain;
- b_ptr->net_plane = bearer_id + 'A';
- b_ptr->priority = priority;
+ b->identity = bearer_id;
+ b->tolerance = m->tolerance;
+ b->window = m->window;
+ b->domain = disc_domain;
+ b->net_plane = bearer_id + 'A';
+ b->priority = priority;
- res = tipc_disc_create(net, b_ptr, &b_ptr->bcast_addr);
+ res = tipc_disc_create(net, b, &b->bcast_addr);
if (res) {
- bearer_disable(net, b_ptr);
+ bearer_disable(net, b);
pr_warn("Bearer <%s> rejected, discovery object creation failed\n",
name);
return -EINVAL;
}
- rcu_assign_pointer(tn->bearer_list[bearer_id], b_ptr);
+ rcu_assign_pointer(tn->bearer_list[bearer_id], b);
pr_info("Enabled bearer <%s>, discovery domain %s, priority %u\n",
name,
@@ -336,11 +320,11 @@ restart:
/**
* tipc_reset_bearer - Reset all links established over this bearer
*/
-static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr)
+static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b)
{
- pr_info("Resetting bearer <%s>\n", b_ptr->name);
- tipc_node_delete_links(net, b_ptr->identity);
- tipc_disc_reset(net, b_ptr);
+ pr_info("Resetting bearer <%s>\n", b->name);
+ tipc_node_delete_links(net, b->identity);
+ tipc_disc_reset(net, b);
return 0;
}
@@ -349,26 +333,26 @@ static int tipc_reset_bearer(struct net *net, struct tipc_bearer *b_ptr)
*
* Note: This routine assumes caller holds RTNL lock.
*/
-static void bearer_disable(struct net *net, struct tipc_bearer *b_ptr)
+static void bearer_disable(struct net *net, struct tipc_bearer *b)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
u32 i;
- pr_info("Disabling bearer <%s>\n", b_ptr->name);
- b_ptr->media->disable_media(b_ptr);
+ pr_info("Disabling bearer <%s>\n", b->name);
+ b->media->disable_media(b);
- tipc_node_delete_links(net, b_ptr->identity);
- RCU_INIT_POINTER(b_ptr->media_ptr, NULL);
- if (b_ptr->link_req)
- tipc_disc_delete(b_ptr->link_req);
+ tipc_node_delete_links(net, b->identity);
+ RCU_INIT_POINTER(b->media_ptr, NULL);
+ if (b->link_req)
+ tipc_disc_delete(b->link_req);
for (i = 0; i < MAX_BEARERS; i++) {
- if (b_ptr == rtnl_dereference(tn->bearer_list[i])) {
+ if (b == rtnl_dereference(tn->bearer_list[i])) {
RCU_INIT_POINTER(tn->bearer_list[i], NULL);
break;
}
}
- kfree_rcu(b_ptr, rcu);
+ kfree_rcu(b, rcu);
}
int tipc_enable_l2_media(struct net *net, struct tipc_bearer *b,
@@ -411,7 +395,7 @@ void tipc_disable_l2_media(struct tipc_bearer *b)
/**
* tipc_l2_send_msg - send a TIPC packet out over an L2 interface
* @buf: the packet to be sent
- * @b_ptr: the bearer through which the packet is to be sent
+ * @b: the bearer through which the packet is to be sent
* @dest: peer destination address
*/
int tipc_l2_send_msg(struct net *net, struct sk_buff *skb,
@@ -532,14 +516,14 @@ void tipc_bearer_bc_xmit(struct net *net, u32 bearer_id,
static int tipc_l2_rcv_msg(struct sk_buff *buf, struct net_device *dev,
struct packet_type *pt, struct net_device *orig_dev)
{
- struct tipc_bearer *b_ptr;
+ struct tipc_bearer *b;
rcu_read_lock();
- b_ptr = rcu_dereference_rtnl(dev->tipc_ptr);
- if (likely(b_ptr)) {
+ b = rcu_dereference_rtnl(dev->tipc_ptr);
+ if (likely(b)) {
if (likely(buf->pkt_type <= PACKET_BROADCAST)) {
buf->next = NULL;
- tipc_rcv(dev_net(dev), buf, b_ptr);
+ tipc_rcv(dev_net(dev), buf, b);
rcu_read_unlock();
return NET_RX_SUCCESS;
}
@@ -564,13 +548,13 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
struct net *net = dev_net(dev);
- struct tipc_bearer *b_ptr;
+ struct tipc_bearer *b;
- b_ptr = rtnl_dereference(dev->tipc_ptr);
- if (!b_ptr)
+ b = rtnl_dereference(dev->tipc_ptr);
+ if (!b)
return NOTIFY_DONE;
- b_ptr->mtu = dev->mtu;
+ b->mtu = dev->mtu;
switch (evt) {
case NETDEV_CHANGE:
@@ -578,16 +562,16 @@ static int tipc_l2_device_event(struct notifier_block *nb, unsigned long evt,
break;
case NETDEV_GOING_DOWN:
case NETDEV_CHANGEMTU:
- tipc_reset_bearer(net, b_ptr);
+ tipc_reset_bearer(net, b);
break;
case NETDEV_CHANGEADDR:
- b_ptr->media->raw2addr(b_ptr, &b_ptr->addr,
+ b->media->raw2addr(b, &b->addr,
(char *)dev->dev_addr);
- tipc_reset_bearer(net, b_ptr);
+ tipc_reset_bearer(net, b);
break;
case NETDEV_UNREGISTER:
case NETDEV_CHANGENAME:
- bearer_disable(dev_net(dev), b_ptr);
+ bearer_disable(dev_net(dev), b);
break;
}
return NOTIFY_OK;
@@ -623,13 +607,13 @@ void tipc_bearer_cleanup(void)
void tipc_bearer_stop(struct net *net)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_bearer *b_ptr;
+ struct tipc_bearer *b;
u32 i;
for (i = 0; i < MAX_BEARERS; i++) {
- b_ptr = rtnl_dereference(tn->bearer_list[i]);
- if (b_ptr) {
- bearer_disable(net, b_ptr);
+ b = rtnl_dereference(tn->bearer_list[i]);
+ if (b) {
+ bearer_disable(net, b);
tn->bearer_list[i] = NULL;
}
}
diff --git a/net/tipc/bearer.h b/net/tipc/bearer.h
index 552185bc4773..e31820516774 100644
--- a/net/tipc/bearer.h
+++ b/net/tipc/bearer.h
@@ -103,11 +103,11 @@ struct tipc_bearer;
*/
struct tipc_media {
int (*send_msg)(struct net *net, struct sk_buff *buf,
- struct tipc_bearer *b_ptr,
+ struct tipc_bearer *b,
struct tipc_media_addr *dest);
- int (*enable_media)(struct net *net, struct tipc_bearer *b_ptr,
+ int (*enable_media)(struct net *net, struct tipc_bearer *b,
struct nlattr *attr[]);
- void (*disable_media)(struct tipc_bearer *b_ptr);
+ void (*disable_media)(struct tipc_bearer *b);
int (*addr2str)(struct tipc_media_addr *addr,
char *strbuf,
int bufsz);
@@ -176,7 +176,7 @@ struct tipc_bearer_names {
* TIPC routines available to supported media types
*/
-void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b_ptr);
+void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b);
/*
* Routines made available to TIPC by supported media types
diff --git a/net/tipc/core.h b/net/tipc/core.h
index 18e95a8020cd..5504d63503df 100644
--- a/net/tipc/core.h
+++ b/net/tipc/core.h
@@ -118,6 +118,11 @@ static inline int tipc_netid(struct net *net)
return tipc_net(net)->net_id;
}
+static inline struct list_head *tipc_nodes(struct net *net)
+{
+ return &tipc_net(net)->node_list;
+}
+
static inline u16 mod(u16 x)
{
return x & 0xffffu;
diff --git a/net/tipc/discover.c b/net/tipc/discover.c
index afe8c47c4085..f1e738e80535 100644
--- a/net/tipc/discover.c
+++ b/net/tipc/discover.c
@@ -75,14 +75,14 @@ struct tipc_link_req {
* tipc_disc_init_msg - initialize a link setup message
* @net: the applicable net namespace
* @type: message type (request or response)
- * @b_ptr: ptr to bearer issuing message
+ * @b: ptr to bearer issuing message
*/
static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type,
- struct tipc_bearer *b_ptr)
+ struct tipc_bearer *b)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_msg *msg;
- u32 dest_domain = b_ptr->domain;
+ u32 dest_domain = b->domain;
msg = buf_msg(buf);
tipc_msg_init(tn->own_addr, msg, LINK_CONFIG, type,
@@ -92,16 +92,16 @@ static void tipc_disc_init_msg(struct net *net, struct sk_buff *buf, u32 type,
msg_set_node_capabilities(msg, TIPC_NODE_CAPABILITIES);
msg_set_dest_domain(msg, dest_domain);
msg_set_bc_netid(msg, tn->net_id);
- b_ptr->media->addr2msg(msg_media_addr(msg), &b_ptr->addr);
+ b->media->addr2msg(msg_media_addr(msg), &b->addr);
}
/**
* disc_dupl_alert - issue node address duplication alert
- * @b_ptr: pointer to bearer detecting duplication
+ * @b: pointer to bearer detecting duplication
* @node_addr: duplicated node address
* @media_addr: media address advertised by duplicated node
*/
-static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr,
+static void disc_dupl_alert(struct tipc_bearer *b, u32 node_addr,
struct tipc_media_addr *media_addr)
{
char node_addr_str[16];
@@ -111,7 +111,7 @@ static void disc_dupl_alert(struct tipc_bearer *b_ptr, u32 node_addr,
tipc_media_addr_printf(media_addr_str, sizeof(media_addr_str),
media_addr);
pr_warn("Duplicate %s using %s seen on <%s>\n", node_addr_str,
- media_addr_str, b_ptr->name);
+ media_addr_str, b->name);
}
/**
@@ -261,13 +261,13 @@ exit:
/**
* tipc_disc_create - create object to send periodic link setup requests
* @net: the applicable net namespace
- * @b_ptr: ptr to bearer issuing requests
+ * @b: ptr to bearer issuing requests
* @dest: destination address for request messages
* @dest_domain: network domain to which links can be established
*
* Returns 0 if successful, otherwise -errno.
*/
-int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
+int tipc_disc_create(struct net *net, struct tipc_bearer *b,
struct tipc_media_addr *dest)
{
struct tipc_link_req *req;
@@ -282,17 +282,17 @@ int tipc_disc_create(struct net *net, struct tipc_bearer *b_ptr,
return -ENOMEM;
}
- tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b_ptr);
+ tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b);
memcpy(&req->dest, dest, sizeof(*dest));
req->net = net;
- req->bearer_id = b_ptr->identity;
- req->domain = b_ptr->domain;
+ req->bearer_id = b->identity;
+ req->domain = b->domain;
req->num_nodes = 0;
req->timer_intv = TIPC_LINK_REQ_INIT;
spin_lock_init(&req->lock);
setup_timer(&req->timer, disc_timeout, (unsigned long)req);
mod_timer(&req->timer, jiffies + req->timer_intv);
- b_ptr->link_req = req;
+ b->link_req = req;
skb = skb_clone(req->buf, GFP_ATOMIC);
if (skb)
tipc_bearer_xmit_skb(net, req->bearer_id, skb, &req->dest);
@@ -313,19 +313,19 @@ void tipc_disc_delete(struct tipc_link_req *req)
/**
* tipc_disc_reset - reset object to send periodic link setup requests
* @net: the applicable net namespace
- * @b_ptr: ptr to bearer issuing requests
+ * @b: ptr to bearer issuing requests
* @dest_domain: network domain to which links can be established
*/
-void tipc_disc_reset(struct net *net, struct tipc_bearer *b_ptr)
+void tipc_disc_reset(struct net *net, struct tipc_bearer *b)
{
- struct tipc_link_req *req = b_ptr->link_req;
+ struct tipc_link_req *req = b->link_req;
struct sk_buff *skb;
spin_lock_bh(&req->lock);
- tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b_ptr);
+ tipc_disc_init_msg(net, req->buf, DSC_REQ_MSG, b);
req->net = net;
- req->bearer_id = b_ptr->identity;
- req->domain = b_ptr->domain;
+ req->bearer_id = b->identity;
+ req->domain = b->domain;
req->num_nodes = 0;
req->timer_intv = TIPC_LINK_REQ_INIT;
mod_timer(&req->timer, jiffies + req->timer_intv);
diff --git a/net/tipc/link.c b/net/tipc/link.c
index 9efbdbde2b08..7d2bb3e70baa 100644
--- a/net/tipc/link.c
+++ b/net/tipc/link.c
@@ -1,7 +1,7 @@
/*
* net/tipc/link.c: TIPC link code
*
- * Copyright (c) 1996-2007, 2012-2015, Ericsson AB
+ * Copyright (c) 1996-2007, 2012-2016, Ericsson AB
* Copyright (c) 2004-2007, 2010-2013, Wind River Systems
* All rights reserved.
*
@@ -45,36 +45,152 @@
#include <linux/pkt_sched.h>
+struct tipc_stats {
+ u32 sent_info; /* used in counting # sent packets */
+ u32 recv_info; /* used in counting # recv'd packets */
+ u32 sent_states;
+ u32 recv_states;
+ u32 sent_probes;
+ u32 recv_probes;
+ u32 sent_nacks;
+ u32 recv_nacks;
+ u32 sent_acks;
+ u32 sent_bundled;
+ u32 sent_bundles;
+ u32 recv_bundled;
+ u32 recv_bundles;
+ u32 retransmitted;
+ u32 sent_fragmented;
+ u32 sent_fragments;
+ u32 recv_fragmented;
+ u32 recv_fragments;
+ u32 link_congs; /* # port sends blocked by congestion */
+ u32 deferred_recv;
+ u32 duplicates;
+ u32 max_queue_sz; /* send queue size high water mark */
+ u32 accu_queue_sz; /* used for send queue size profiling */
+ u32 queue_sz_counts; /* used for send queue size profiling */
+ u32 msg_length_counts; /* used for message length profiling */
+ u32 msg_lengths_total; /* used for message length profiling */
+ u32 msg_length_profile[7]; /* used for msg. length profiling */
+};
+
+/**
+ * struct tipc_link - TIPC link data structure
+ * @addr: network address of link's peer node
+ * @name: link name character string
+ * @media_addr: media address to use when sending messages over link
+ * @timer: link timer
+ * @net: pointer to namespace struct
+ * @refcnt: reference counter for permanent references (owner node & timer)
+ * @peer_session: link session # being used by peer end of link
+ * @peer_bearer_id: bearer id used by link's peer endpoint
+ * @bearer_id: local bearer id used by link
+ * @tolerance: minimum link continuity loss needed to reset link [in ms]
+ * @keepalive_intv: link keepalive timer interval
+ * @abort_limit: # of unacknowledged continuity probes needed to reset link
+ * @state: current state of link FSM
+ * @peer_caps: bitmap describing capabilities of peer node
+ * @silent_intv_cnt: # of timer intervals without any reception from peer
+ * @proto_msg: template for control messages generated by link
+ * @pmsg: convenience pointer to "proto_msg" field
+ * @priority: current link priority
+ * @net_plane: current link network plane ('A' through 'H')
+ * @backlog_limit: backlog queue congestion thresholds (indexed by importance)
+ * @exp_msg_count: # of tunnelled messages expected during link changeover
+ * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset
+ * @mtu: current maximum packet size for this link
+ * @advertised_mtu: advertised own mtu when link is being established
+ * @transmitq: queue for sent, non-acked messages
+ * @backlogq: queue for messages waiting to be sent
+ * @snt_nxt: next sequence number to use for outbound messages
+ * @last_retransmitted: sequence number of most recently retransmitted message
+ * @stale_count: # of identical retransmit requests made by peer
+ * @ackers: # of peers that needs to ack each packet before it can be released
+ * @acked: # last packet acked by a certain peer. Used for broadcast.
+ * @rcv_nxt: next sequence number to expect for inbound messages
+ * @deferred_queue: deferred queue saved OOS b'cast message received from node
+ * @unacked_window: # of inbound messages rx'd without ack'ing back to peer
+ * @inputq: buffer queue for messages to be delivered upwards
+ * @namedq: buffer queue for name table messages to be delivered upwards
+ * @next_out: ptr to first unsent outbound message in queue
+ * @wakeupq: linked list of wakeup msgs waiting for link congestion to abate
+ * @long_msg_seq_no: next identifier to use for outbound fragmented messages
+ * @reasm_buf: head of partially reassembled inbound message fragments
+ * @bc_rcvr: marks that this is a broadcast receiver link
+ * @stats: collects statistics regarding link activity
+ */
+struct tipc_link {
+ u32 addr;
+ char name[TIPC_MAX_LINK_NAME];
+ struct net *net;
+
+ /* Management and link supervision data */
+ u32 peer_session;
+ u32 session;
+ u32 peer_bearer_id;
+ u32 bearer_id;
+ u32 tolerance;
+ unsigned long keepalive_intv;
+ u32 abort_limit;
+ u32 state;
+ u16 peer_caps;
+ bool active;
+ u32 silent_intv_cnt;
+ char if_name[TIPC_MAX_IF_NAME];
+ u32 priority;
+ char net_plane;
+
+ /* Failover/synch */
+ u16 drop_point;
+ struct sk_buff *failover_reasm_skb;
+
+ /* Max packet negotiation */
+ u16 mtu;
+ u16 advertised_mtu;
+
+ /* Sending */
+ struct sk_buff_head transmq;
+ struct sk_buff_head backlogq;
+ struct {
+ u16 len;
+ u16 limit;
+ } backlog[5];
+ u16 snd_nxt;
+ u16 last_retransm;
+ u16 window;
+ u32 stale_count;
+
+ /* Reception */
+ u16 rcv_nxt;
+ u32 rcv_unacked;
+ struct sk_buff_head deferdq;
+ struct sk_buff_head *inputq;
+ struct sk_buff_head *namedq;
+
+ /* Congestion handling */
+ struct sk_buff_head wakeupq;
+
+ /* Fragmentation/reassembly */
+ struct sk_buff *reasm_buf;
+
+ /* Broadcast */
+ u16 ackers;
+ u16 acked;
+ struct tipc_link *bc_rcvlink;
+ struct tipc_link *bc_sndlink;
+ int nack_state;
+ bool bc_peer_is_up;
+
+ /* Statistics */
+ struct tipc_stats stats;
+};
+
/*
* Error message prefixes
*/
static const char *link_co_err = "Link tunneling error, ";
static const char *link_rst_msg = "Resetting link ";
-static const char tipc_bclink_name[] = "broadcast-link";
-
-static const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
- [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_LINK_NAME] = {
- .type = NLA_STRING,
- .len = TIPC_MAX_LINK_NAME
- },
- [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 },
- [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG },
- [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG },
- [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG },
- [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED },
- [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED },
- [TIPC_NLA_LINK_RX] = { .type = NLA_U32 },
- [TIPC_NLA_LINK_TX] = { .type = NLA_U32 }
-};
-
-/* Properties valid for media, bearar and link */
-static const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
- [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 },
- [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 },
- [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }
-};
/* Send states for broadcast NACKs
*/
@@ -88,10 +204,11 @@ enum {
* Interval between NACKs when packets arrive out of order
*/
#define TIPC_NACK_INTV (TIPC_MIN_LINK_WIN * 2)
-/*
- * Out-of-range value for link session numbers
+
+/* Wildcard value for link session numbers. When it is known that
+ * peer endpoint is down, any session number must be accepted.
*/
-#define WILDCARD_SESSION 0x10000
+#define ANY_SESSION 0x10000
/* Link FSM states:
*/
@@ -117,8 +234,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
u16 rcvgap, int tolerance, int priority,
struct sk_buff_head *xmitq);
-static void link_reset_statistics(struct tipc_link *l_ptr);
-static void link_print(struct tipc_link *l_ptr, const char *str);
+static void link_print(struct tipc_link *l, const char *str);
static void tipc_link_build_nack_msg(struct tipc_link *l,
struct sk_buff_head *xmitq);
static void tipc_link_build_bc_init_msg(struct tipc_link *l,
@@ -183,6 +299,36 @@ void tipc_link_set_active(struct tipc_link *l, bool active)
l->active = active;
}
+u32 tipc_link_id(struct tipc_link *l)
+{
+ return l->peer_bearer_id << 16 | l->bearer_id;
+}
+
+int tipc_link_window(struct tipc_link *l)
+{
+ return l->window;
+}
+
+int tipc_link_prio(struct tipc_link *l)
+{
+ return l->priority;
+}
+
+unsigned long tipc_link_tolerance(struct tipc_link *l)
+{
+ return l->tolerance;
+}
+
+struct sk_buff_head *tipc_link_inputq(struct tipc_link *l)
+{
+ return l->inputq;
+}
+
+char tipc_link_plane(struct tipc_link *l)
+{
+ return l->net_plane;
+}
+
void tipc_link_add_bc_peer(struct tipc_link *snd_l,
struct tipc_link *uc_l,
struct sk_buff_head *xmitq)
@@ -191,6 +337,7 @@ void tipc_link_add_bc_peer(struct tipc_link *snd_l,
snd_l->ackers++;
rcv_l->acked = snd_l->snd_nxt - 1;
+ snd_l->state = LINK_ESTABLISHED;
tipc_link_build_bc_init_msg(uc_l, xmitq);
}
@@ -206,6 +353,7 @@ void tipc_link_remove_bc_peer(struct tipc_link *snd_l,
rcv_l->state = LINK_RESET;
if (!snd_l->ackers) {
tipc_link_reset(snd_l);
+ snd_l->state = LINK_RESET;
__skb_queue_purge(xmitq);
}
}
@@ -225,9 +373,19 @@ int tipc_link_mtu(struct tipc_link *l)
return l->mtu;
}
-static u32 link_own_addr(struct tipc_link *l)
+u16 tipc_link_rcv_nxt(struct tipc_link *l)
+{
+ return l->rcv_nxt;
+}
+
+u16 tipc_link_acked(struct tipc_link *l)
{
- return msg_prevnode(l->pmsg);
+ return l->acked;
+}
+
+char *tipc_link_name(struct tipc_link *l)
+{
+ return l->name;
}
/**
@@ -263,29 +421,22 @@ bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
struct tipc_link **link)
{
struct tipc_link *l;
- struct tipc_msg *hdr;
l = kzalloc(sizeof(*l), GFP_ATOMIC);
if (!l)
return false;
*link = l;
- l->pmsg = (struct tipc_msg *)&l->proto_msg;
- hdr = l->pmsg;
- tipc_msg_init(ownnode, hdr, LINK_PROTOCOL, RESET_MSG, INT_H_SIZE, peer);
- msg_set_size(hdr, sizeof(l->proto_msg));
- msg_set_session(hdr, session);
- msg_set_bearer_id(hdr, l->bearer_id);
+ l->session = session;
/* Note: peer i/f name is completed by reset/activate message */
sprintf(l->name, "%u.%u.%u:%s-%u.%u.%u:unknown",
tipc_zone(ownnode), tipc_cluster(ownnode), tipc_node(ownnode),
if_name, tipc_zone(peer), tipc_cluster(peer), tipc_node(peer));
- strcpy((char *)msg_data(hdr), if_name);
-
+ strcpy(l->if_name, if_name);
l->addr = peer;
l->peer_caps = peer_caps;
l->net = net;
- l->peer_session = WILDCARD_SESSION;
+ l->peer_session = ANY_SESSION;
l->bearer_id = bearer_id;
l->tolerance = tolerance;
l->net_plane = net_plane;
@@ -612,7 +763,7 @@ static int link_schedule_user(struct tipc_link *link, struct sk_buff_head *list)
struct tipc_msg *msg = buf_msg(skb_peek(list));
int imp = msg_importance(msg);
u32 oport = msg_origport(msg);
- u32 addr = link_own_addr(link);
+ u32 addr = tipc_own_addr(link->net);
struct sk_buff *skb;
/* This really cannot happen... */
@@ -661,16 +812,9 @@ void link_prepare_wakeup(struct tipc_link *l)
void tipc_link_reset(struct tipc_link *l)
{
- /* Link is down, accept any session */
- l->peer_session = WILDCARD_SESSION;
-
- /* If peer is up, it only accepts an incremented session number */
- msg_set_session(l->pmsg, msg_session(l->pmsg) + 1);
-
- /* Prepare for renewed mtu size negotiation */
+ l->peer_session = ANY_SESSION;
+ l->session++;
l->mtu = l->advertised_mtu;
-
- /* Clean up all queues and counters: */
__skb_queue_purge(&l->transmq);
__skb_queue_purge(&l->deferdq);
skb_queue_splice_init(&l->wakeupq, l->inputq);
@@ -692,7 +836,7 @@ void tipc_link_reset(struct tipc_link *l)
l->stats.recv_info = 0;
l->stale_count = 0;
l->bc_peer_is_up = false;
- link_reset_statistics(l);
+ tipc_link_reset_stats(l);
}
/**
@@ -725,8 +869,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
if (unlikely(l->backlog[i].len >= l->backlog[i].limit))
return link_schedule_user(l, list);
}
- if (unlikely(msg_size(hdr) > mtu))
+ if (unlikely(msg_size(hdr) > mtu)) {
+ skb_queue_purge(list);
return -EMSGSIZE;
+ }
/* Prepare each packet for sending, and add to relevant queue: */
while (skb_queue_len(list)) {
@@ -738,8 +884,10 @@ int tipc_link_xmit(struct tipc_link *l, struct sk_buff_head *list,
if (likely(skb_queue_len(transmq) < maxwin)) {
_skb = skb_clone(skb, GFP_ATOMIC);
- if (!_skb)
+ if (!_skb) {
+ skb_queue_purge(list);
return -ENOBUFS;
+ }
__skb_dequeue(list);
__skb_queue_tail(transmq, skb);
__skb_queue_tail(xmitq, _skb);
@@ -974,7 +1122,7 @@ int tipc_link_build_ack_msg(struct tipc_link *l, struct sk_buff_head *xmitq)
/* Broadcast ACK must be sent via a unicast link => defer to caller */
if (link_is_bc_rcvlink(l)) {
- if (((l->rcv_nxt ^ link_own_addr(l)) & 0xf) != 0xf)
+ if (((l->rcv_nxt ^ tipc_own_addr(l->net)) & 0xf) != 0xf)
return 0;
l->rcv_unacked = 0;
return TIPC_LINK_SND_BC_ACK;
@@ -1082,38 +1230,34 @@ drop:
return rc;
}
-/*
- * Send protocol message to the other endpoint.
- */
-void tipc_link_proto_xmit(struct tipc_link *l, u32 msg_typ, int probe_msg,
- u32 gap, u32 tolerance, u32 priority)
-{
- struct sk_buff *skb = NULL;
- struct sk_buff_head xmitq;
-
- __skb_queue_head_init(&xmitq);
- tipc_link_build_proto_msg(l, msg_typ, probe_msg, gap,
- tolerance, priority, &xmitq);
- skb = __skb_dequeue(&xmitq);
- if (!skb)
- return;
- tipc_bearer_xmit_skb(l->net, l->bearer_id, skb, l->media_addr);
- l->rcv_unacked = 0;
-}
-
static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
u16 rcvgap, int tolerance, int priority,
struct sk_buff_head *xmitq)
{
- struct sk_buff *skb = NULL;
- struct tipc_msg *hdr = l->pmsg;
+ struct sk_buff *skb;
+ struct tipc_msg *hdr;
+ struct sk_buff_head *dfq = &l->deferdq;
bool node_up = link_is_up(l->bc_rcvlink);
/* Don't send protocol message during reset or link failover */
if (tipc_link_is_blocked(l))
return;
- msg_set_type(hdr, mtyp);
+ if (!tipc_link_is_up(l) && (mtyp == STATE_MSG))
+ return;
+
+ if (!skb_queue_empty(dfq))
+ rcvgap = buf_seqno(skb_peek(dfq)) - l->rcv_nxt;
+
+ skb = tipc_msg_create(LINK_PROTOCOL, mtyp, INT_H_SIZE,
+ TIPC_MAX_IF_NAME, l->addr,
+ tipc_own_addr(l->net), 0, 0, 0);
+ if (!skb)
+ return;
+
+ hdr = buf_msg(skb);
+ msg_set_session(hdr, l->session);
+ msg_set_bearer_id(hdr, l->bearer_id);
msg_set_net_plane(hdr, l->net_plane);
msg_set_next_sent(hdr, l->snd_nxt);
msg_set_ack(hdr, l->rcv_nxt - 1);
@@ -1123,36 +1267,23 @@ static void tipc_link_build_proto_msg(struct tipc_link *l, int mtyp, bool probe,
msg_set_linkprio(hdr, priority);
msg_set_redundant_link(hdr, node_up);
msg_set_seq_gap(hdr, 0);
-
- /* Compatibility: created msg must not be in sequence with pkt flow */
msg_set_seqno(hdr, l->snd_nxt + U16_MAX / 2);
if (mtyp == STATE_MSG) {
- if (!tipc_link_is_up(l))
- return;
-
- /* Override rcvgap if there are packets in deferred queue */
- if (!skb_queue_empty(&l->deferdq))
- rcvgap = buf_seqno(skb_peek(&l->deferdq)) - l->rcv_nxt;
- if (rcvgap) {
- msg_set_seq_gap(hdr, rcvgap);
- l->stats.sent_nacks++;
- }
+ msg_set_seq_gap(hdr, rcvgap);
+ msg_set_size(hdr, INT_H_SIZE);
msg_set_probe(hdr, probe);
- if (probe)
- l->stats.sent_probes++;
l->stats.sent_states++;
l->rcv_unacked = 0;
} else {
/* RESET_MSG or ACTIVATE_MSG */
msg_set_max_pkt(hdr, l->advertised_mtu);
- msg_set_ack(hdr, l->rcv_nxt - 1);
- msg_set_next_sent(hdr, 1);
+ strcpy(msg_data(hdr), l->if_name);
}
- skb = tipc_buf_acquire(msg_size(hdr));
- if (!skb)
- return;
- skb_copy_to_linear_data(skb, hdr, msg_size(hdr));
+ if (probe)
+ l->stats.sent_probes++;
+ if (rcvgap)
+ l->stats.sent_nacks++;
skb->priority = TC_PRIO_CONTROL;
__skb_queue_tail(xmitq, skb);
}
@@ -1177,7 +1308,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
/* At least one packet required for safe algorithm => add dummy */
skb = tipc_msg_create(TIPC_LOW_IMPORTANCE, TIPC_DIRECT_MSG,
- BASIC_H_SIZE, 0, l->addr, link_own_addr(l),
+ BASIC_H_SIZE, 0, l->addr, tipc_own_addr(l->net),
0, 0, TIPC_ERR_NO_PORT);
if (!skb) {
pr_warn("%sunable to create tunnel packet\n", link_co_err);
@@ -1188,7 +1319,7 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
__skb_queue_purge(&tmpxq);
/* Initialize reusable tunnel packet header */
- tipc_msg_init(link_own_addr(l), &tnlhdr, TUNNEL_PROTOCOL,
+ tipc_msg_init(tipc_own_addr(l->net), &tnlhdr, TUNNEL_PROTOCOL,
mtyp, INT_H_SIZE, l->addr);
pktcnt = skb_queue_len(&l->transmq) + skb_queue_len(&l->backlogq);
msg_set_msgcnt(&tnlhdr, pktcnt);
@@ -1247,7 +1378,7 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
if (tipc_link_is_blocked(l) || !xmitq)
goto exit;
- if (link_own_addr(l) > msg_prevnode(hdr))
+ if (tipc_own_addr(l->net) > msg_prevnode(hdr))
l->net_plane = msg_net_plane(hdr);
switch (mtyp) {
@@ -1255,11 +1386,13 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
/* Ignore duplicate RESET with old session number */
if ((less_eq(msg_session(hdr), l->peer_session)) &&
- (l->peer_session != WILDCARD_SESSION))
+ (l->peer_session != ANY_SESSION))
break;
/* fall thru' */
case ACTIVATE_MSG:
+ skb_linearize(skb);
+ hdr = buf_msg(skb);
/* Complete own link name with peer's interface name */
if_name = strrchr(l->name, ':') + 1;
@@ -1297,6 +1430,12 @@ static int tipc_link_proto_rcv(struct tipc_link *l, struct sk_buff *skb,
if (in_range(peers_tol, TIPC_MIN_LINK_TOL, TIPC_MAX_LINK_TOL))
l->tolerance = peers_tol;
+ if (peers_prio && in_range(peers_prio, TIPC_MIN_LINK_PRI,
+ TIPC_MAX_LINK_PRI)) {
+ l->priority = peers_prio;
+ rc = tipc_link_fsm_evt(l, LINK_FAILURE_EVT);
+ }
+
l->silent_intv_cnt = 0;
l->stats.recv_states++;
if (msg_probe(hdr))
@@ -1344,7 +1483,7 @@ static bool tipc_link_build_bc_proto_msg(struct tipc_link *l, bool bcast,
u16 gap_to = peers_snd_nxt - 1;
skb = tipc_msg_create(BCAST_PROTOCOL, STATE_MSG, INT_H_SIZE,
- 0, l->addr, link_own_addr(l), 0, 0, 0);
+ 0, l->addr, tipc_own_addr(l->net), 0, 0, 0);
if (!skb)
return false;
hdr = buf_msg(skb);
@@ -1499,7 +1638,7 @@ int tipc_link_bc_nack_rcv(struct tipc_link *l, struct sk_buff *skb,
if (mtyp != STATE_MSG)
return 0;
- if (dnode == link_own_addr(l)) {
+ if (dnode == tipc_own_addr(l->net)) {
tipc_link_bc_ack_rcv(l, acked, xmitq);
rc = tipc_link_retrans(l->bc_sndlink, from, to, xmitq);
l->stats.recv_nacks++;
@@ -1525,53 +1664,17 @@ void tipc_link_set_queue_limits(struct tipc_link *l, u32 win)
l->backlog[TIPC_SYSTEM_IMPORTANCE].limit = max_bulk;
}
-/* tipc_link_find_owner - locate owner node of link by link's name
- * @net: the applicable net namespace
- * @name: pointer to link name string
- * @bearer_id: pointer to index in 'node->links' array where the link was found.
- *
- * Returns pointer to node owning the link, or 0 if no matching link is found.
- */
-static struct tipc_node *tipc_link_find_owner(struct net *net,
- const char *link_name,
- unsigned int *bearer_id)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_link *l_ptr;
- struct tipc_node *n_ptr;
- struct tipc_node *found_node = NULL;
- int i;
-
- *bearer_id = 0;
- rcu_read_lock();
- list_for_each_entry_rcu(n_ptr, &tn->node_list, list) {
- tipc_node_lock(n_ptr);
- for (i = 0; i < MAX_BEARERS; i++) {
- l_ptr = n_ptr->links[i].link;
- if (l_ptr && !strcmp(l_ptr->name, link_name)) {
- *bearer_id = i;
- found_node = n_ptr;
- break;
- }
- }
- tipc_node_unlock(n_ptr);
- if (found_node)
- break;
- }
- rcu_read_unlock();
-
- return found_node;
-}
-
/**
- * link_reset_statistics - reset link statistics
- * @l_ptr: pointer to link
+ * link_reset_stats - reset link statistics
+ * @l: pointer to link
*/
-static void link_reset_statistics(struct tipc_link *l_ptr)
+void tipc_link_reset_stats(struct tipc_link *l)
{
- memset(&l_ptr->stats, 0, sizeof(l_ptr->stats));
- l_ptr->stats.sent_info = l_ptr->snd_nxt;
- l_ptr->stats.recv_info = l_ptr->rcv_nxt;
+ memset(&l->stats, 0, sizeof(l->stats));
+ if (!link_is_bc_sndlink(l)) {
+ l->stats.sent_info = l->snd_nxt;
+ l->stats.recv_info = l->rcv_nxt;
+ }
}
static void link_print(struct tipc_link *l, const char *str)
@@ -1624,84 +1727,6 @@ int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[])
return 0;
}
-int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info)
-{
- int err;
- int res = 0;
- int bearer_id;
- char *name;
- struct tipc_link *link;
- struct tipc_node *node;
- struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
- struct net *net = sock_net(skb->sk);
-
- if (!info->attrs[TIPC_NLA_LINK])
- return -EINVAL;
-
- err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX,
- info->attrs[TIPC_NLA_LINK],
- tipc_nl_link_policy);
- if (err)
- return err;
-
- if (!attrs[TIPC_NLA_LINK_NAME])
- return -EINVAL;
-
- name = nla_data(attrs[TIPC_NLA_LINK_NAME]);
-
- if (strcmp(name, tipc_bclink_name) == 0)
- return tipc_nl_bc_link_set(net, attrs);
-
- node = tipc_link_find_owner(net, name, &bearer_id);
- if (!node)
- return -EINVAL;
-
- tipc_node_lock(node);
-
- link = node->links[bearer_id].link;
- if (!link) {
- res = -EINVAL;
- goto out;
- }
-
- if (attrs[TIPC_NLA_LINK_PROP]) {
- struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
-
- err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP],
- props);
- if (err) {
- res = err;
- goto out;
- }
-
- if (props[TIPC_NLA_PROP_TOL]) {
- u32 tol;
-
- tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
- link->tolerance = tol;
- tipc_link_proto_xmit(link, STATE_MSG, 0, 0, tol, 0);
- }
- if (props[TIPC_NLA_PROP_PRIO]) {
- u32 prio;
-
- prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
- link->priority = prio;
- tipc_link_proto_xmit(link, STATE_MSG, 0, 0, 0, prio);
- }
- if (props[TIPC_NLA_PROP_WIN]) {
- u32 win;
-
- win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
- tipc_link_set_queue_limits(link, win);
- }
- }
-
-out:
- tipc_node_unlock(node);
-
- return res;
-}
-
static int __tipc_nl_add_stats(struct sk_buff *skb, struct tipc_stats *s)
{
int i;
@@ -1768,8 +1793,8 @@ msg_full:
}
/* Caller should hold appropriate locks to protect the link */
-static int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
- struct tipc_link *link, int nlflags)
+int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
+ struct tipc_link *link, int nlflags)
{
int err;
void *hdr;
@@ -1838,198 +1863,138 @@ msg_full:
return -EMSGSIZE;
}
-/* Caller should hold node lock */
-static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
- struct tipc_node *node, u32 *prev_link)
+static int __tipc_nl_add_bc_link_stat(struct sk_buff *skb,
+ struct tipc_stats *stats)
{
- u32 i;
- int err;
-
- for (i = *prev_link; i < MAX_BEARERS; i++) {
- *prev_link = i;
-
- if (!node->links[i].link)
- continue;
+ int i;
+ struct nlattr *nest;
- err = __tipc_nl_add_link(net, msg,
- node->links[i].link, NLM_F_MULTI);
- if (err)
- return err;
- }
- *prev_link = 0;
+ struct nla_map {
+ __u32 key;
+ __u32 val;
+ };
- return 0;
-}
+ struct nla_map map[] = {
+ {TIPC_NLA_STATS_RX_INFO, stats->recv_info},
+ {TIPC_NLA_STATS_RX_FRAGMENTS, stats->recv_fragments},
+ {TIPC_NLA_STATS_RX_FRAGMENTED, stats->recv_fragmented},
+ {TIPC_NLA_STATS_RX_BUNDLES, stats->recv_bundles},
+ {TIPC_NLA_STATS_RX_BUNDLED, stats->recv_bundled},
+ {TIPC_NLA_STATS_TX_INFO, stats->sent_info},
+ {TIPC_NLA_STATS_TX_FRAGMENTS, stats->sent_fragments},
+ {TIPC_NLA_STATS_TX_FRAGMENTED, stats->sent_fragmented},
+ {TIPC_NLA_STATS_TX_BUNDLES, stats->sent_bundles},
+ {TIPC_NLA_STATS_TX_BUNDLED, stats->sent_bundled},
+ {TIPC_NLA_STATS_RX_NACKS, stats->recv_nacks},
+ {TIPC_NLA_STATS_RX_DEFERRED, stats->deferred_recv},
+ {TIPC_NLA_STATS_TX_NACKS, stats->sent_nacks},
+ {TIPC_NLA_STATS_TX_ACKS, stats->sent_acks},
+ {TIPC_NLA_STATS_RETRANSMITTED, stats->retransmitted},
+ {TIPC_NLA_STATS_DUPLICATES, stats->duplicates},
+ {TIPC_NLA_STATS_LINK_CONGS, stats->link_congs},
+ {TIPC_NLA_STATS_MAX_QUEUE, stats->max_queue_sz},
+ {TIPC_NLA_STATS_AVG_QUEUE, stats->queue_sz_counts ?
+ (stats->accu_queue_sz / stats->queue_sz_counts) : 0}
+ };
-int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb)
-{
- struct net *net = sock_net(skb->sk);
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_node *node;
- struct tipc_nl_msg msg;
- u32 prev_node = cb->args[0];
- u32 prev_link = cb->args[1];
- int done = cb->args[2];
- int err;
+ nest = nla_nest_start(skb, TIPC_NLA_LINK_STATS);
+ if (!nest)
+ return -EMSGSIZE;
- if (done)
- return 0;
+ for (i = 0; i < ARRAY_SIZE(map); i++)
+ if (nla_put_u32(skb, map[i].key, map[i].val))
+ goto msg_full;
- msg.skb = skb;
- msg.portid = NETLINK_CB(cb->skb).portid;
- msg.seq = cb->nlh->nlmsg_seq;
-
- rcu_read_lock();
- if (prev_node) {
- node = tipc_node_find(net, prev_node);
- if (!node) {
- /* We never set seq or call nl_dump_check_consistent()
- * this means that setting prev_seq here will cause the
- * consistence check to fail in the netlink callback
- * handler. Resulting in the last NLMSG_DONE message
- * having the NLM_F_DUMP_INTR flag set.
- */
- cb->prev_seq = 1;
- goto out;
- }
- tipc_node_put(node);
-
- list_for_each_entry_continue_rcu(node, &tn->node_list,
- list) {
- tipc_node_lock(node);
- err = __tipc_nl_add_node_links(net, &msg, node,
- &prev_link);
- tipc_node_unlock(node);
- if (err)
- goto out;
-
- prev_node = node->addr;
- }
- } else {
- err = tipc_nl_add_bc_link(net, &msg);
- if (err)
- goto out;
-
- list_for_each_entry_rcu(node, &tn->node_list, list) {
- tipc_node_lock(node);
- err = __tipc_nl_add_node_links(net, &msg, node,
- &prev_link);
- tipc_node_unlock(node);
- if (err)
- goto out;
-
- prev_node = node->addr;
- }
- }
- done = 1;
-out:
- rcu_read_unlock();
+ nla_nest_end(skb, nest);
- cb->args[0] = prev_node;
- cb->args[1] = prev_link;
- cb->args[2] = done;
+ return 0;
+msg_full:
+ nla_nest_cancel(skb, nest);
- return skb->len;
+ return -EMSGSIZE;
}
-int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info)
+int tipc_nl_add_bc_link(struct net *net, struct tipc_nl_msg *msg)
{
- struct net *net = genl_info_net(info);
- struct tipc_nl_msg msg;
- char *name;
int err;
+ void *hdr;
+ struct nlattr *attrs;
+ struct nlattr *prop;
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *bcl = tn->bcl;
- msg.portid = info->snd_portid;
- msg.seq = info->snd_seq;
-
- if (!info->attrs[TIPC_NLA_LINK_NAME])
- return -EINVAL;
- name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]);
+ if (!bcl)
+ return 0;
- msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
- if (!msg.skb)
- return -ENOMEM;
+ tipc_bcast_lock(net);
- if (strcmp(name, tipc_bclink_name) == 0) {
- err = tipc_nl_add_bc_link(net, &msg);
- if (err) {
- nlmsg_free(msg.skb);
- return err;
- }
- } else {
- int bearer_id;
- struct tipc_node *node;
- struct tipc_link *link;
+ hdr = genlmsg_put(msg->skb, msg->portid, msg->seq, &tipc_genl_family,
+ NLM_F_MULTI, TIPC_NL_LINK_GET);
+ if (!hdr) {
+ tipc_bcast_unlock(net);
+ return -EMSGSIZE;
+ }
- node = tipc_link_find_owner(net, name, &bearer_id);
- if (!node)
- return -EINVAL;
+ attrs = nla_nest_start(msg->skb, TIPC_NLA_LINK);
+ if (!attrs)
+ goto msg_full;
- tipc_node_lock(node);
- link = node->links[bearer_id].link;
- if (!link) {
- tipc_node_unlock(node);
- nlmsg_free(msg.skb);
- return -EINVAL;
- }
+ /* The broadcast link is always up */
+ if (nla_put_flag(msg->skb, TIPC_NLA_LINK_UP))
+ goto attr_msg_full;
- err = __tipc_nl_add_link(net, &msg, link, 0);
- tipc_node_unlock(node);
- if (err) {
- nlmsg_free(msg.skb);
- return err;
- }
- }
+ if (nla_put_flag(msg->skb, TIPC_NLA_LINK_BROADCAST))
+ goto attr_msg_full;
+ if (nla_put_string(msg->skb, TIPC_NLA_LINK_NAME, bcl->name))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_RX, bcl->rcv_nxt))
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_LINK_TX, bcl->snd_nxt))
+ goto attr_msg_full;
- return genlmsg_reply(msg.skb, info);
-}
+ prop = nla_nest_start(msg->skb, TIPC_NLA_LINK_PROP);
+ if (!prop)
+ goto attr_msg_full;
+ if (nla_put_u32(msg->skb, TIPC_NLA_PROP_WIN, bcl->window))
+ goto prop_msg_full;
+ nla_nest_end(msg->skb, prop);
-int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info)
-{
- int err;
- char *link_name;
- unsigned int bearer_id;
- struct tipc_link *link;
- struct tipc_node *node;
- struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
- struct net *net = sock_net(skb->sk);
-
- if (!info->attrs[TIPC_NLA_LINK])
- return -EINVAL;
-
- err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX,
- info->attrs[TIPC_NLA_LINK],
- tipc_nl_link_policy);
+ err = __tipc_nl_add_bc_link_stat(msg->skb, &bcl->stats);
if (err)
- return err;
-
- if (!attrs[TIPC_NLA_LINK_NAME])
- return -EINVAL;
-
- link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]);
+ goto attr_msg_full;
- if (strcmp(link_name, tipc_bclink_name) == 0) {
- err = tipc_bclink_reset_stats(net);
- if (err)
- return err;
- return 0;
- }
+ tipc_bcast_unlock(net);
+ nla_nest_end(msg->skb, attrs);
+ genlmsg_end(msg->skb, hdr);
- node = tipc_link_find_owner(net, link_name, &bearer_id);
- if (!node)
- return -EINVAL;
+ return 0;
- tipc_node_lock(node);
+prop_msg_full:
+ nla_nest_cancel(msg->skb, prop);
+attr_msg_full:
+ nla_nest_cancel(msg->skb, attrs);
+msg_full:
+ tipc_bcast_unlock(net);
+ genlmsg_cancel(msg->skb, hdr);
- link = node->links[bearer_id].link;
- if (!link) {
- tipc_node_unlock(node);
- return -EINVAL;
- }
+ return -EMSGSIZE;
+}
- link_reset_statistics(link);
+void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
+ struct sk_buff_head *xmitq)
+{
+ l->tolerance = tol;
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, tol, 0, xmitq);
+}
- tipc_node_unlock(node);
+void tipc_link_set_prio(struct tipc_link *l, u32 prio,
+ struct sk_buff_head *xmitq)
+{
+ l->priority = prio;
+ tipc_link_build_proto_msg(l, STATE_MSG, 0, 0, 0, prio, xmitq);
+}
- return 0;
+void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit)
+{
+ l->abort_limit = limit;
}
diff --git a/net/tipc/link.h b/net/tipc/link.h
index 66d859b66c84..6a94175ee20a 100644
--- a/net/tipc/link.h
+++ b/net/tipc/link.h
@@ -45,10 +45,6 @@
*/
#define ELINKCONG EAGAIN /* link congestion <=> resource unavailable */
-/* Out-of-range value for link sequence numbers
- */
-#define INVALID_LINK_SEQ 0x10000
-
/* Link FSM events:
*/
enum {
@@ -75,151 +71,6 @@ enum {
*/
#define MAX_PKT_DEFAULT 1500
-struct tipc_stats {
- u32 sent_info; /* used in counting # sent packets */
- u32 recv_info; /* used in counting # recv'd packets */
- u32 sent_states;
- u32 recv_states;
- u32 sent_probes;
- u32 recv_probes;
- u32 sent_nacks;
- u32 recv_nacks;
- u32 sent_acks;
- u32 sent_bundled;
- u32 sent_bundles;
- u32 recv_bundled;
- u32 recv_bundles;
- u32 retransmitted;
- u32 sent_fragmented;
- u32 sent_fragments;
- u32 recv_fragmented;
- u32 recv_fragments;
- u32 link_congs; /* # port sends blocked by congestion */
- u32 deferred_recv;
- u32 duplicates;
- u32 max_queue_sz; /* send queue size high water mark */
- u32 accu_queue_sz; /* used for send queue size profiling */
- u32 queue_sz_counts; /* used for send queue size profiling */
- u32 msg_length_counts; /* used for message length profiling */
- u32 msg_lengths_total; /* used for message length profiling */
- u32 msg_length_profile[7]; /* used for msg. length profiling */
-};
-
-/**
- * struct tipc_link - TIPC link data structure
- * @addr: network address of link's peer node
- * @name: link name character string
- * @media_addr: media address to use when sending messages over link
- * @timer: link timer
- * @net: pointer to namespace struct
- * @refcnt: reference counter for permanent references (owner node & timer)
- * @peer_session: link session # being used by peer end of link
- * @peer_bearer_id: bearer id used by link's peer endpoint
- * @bearer_id: local bearer id used by link
- * @tolerance: minimum link continuity loss needed to reset link [in ms]
- * @keepalive_intv: link keepalive timer interval
- * @abort_limit: # of unacknowledged continuity probes needed to reset link
- * @state: current state of link FSM
- * @peer_caps: bitmap describing capabilities of peer node
- * @silent_intv_cnt: # of timer intervals without any reception from peer
- * @proto_msg: template for control messages generated by link
- * @pmsg: convenience pointer to "proto_msg" field
- * @priority: current link priority
- * @net_plane: current link network plane ('A' through 'H')
- * @backlog_limit: backlog queue congestion thresholds (indexed by importance)
- * @exp_msg_count: # of tunnelled messages expected during link changeover
- * @reset_rcv_checkpt: seq # of last acknowledged message at time of link reset
- * @mtu: current maximum packet size for this link
- * @advertised_mtu: advertised own mtu when link is being established
- * @transmitq: queue for sent, non-acked messages
- * @backlogq: queue for messages waiting to be sent
- * @snt_nxt: next sequence number to use for outbound messages
- * @last_retransmitted: sequence number of most recently retransmitted message
- * @stale_count: # of identical retransmit requests made by peer
- * @ackers: # of peers that needs to ack each packet before it can be released
- * @acked: # last packet acked by a certain peer. Used for broadcast.
- * @rcv_nxt: next sequence number to expect for inbound messages
- * @deferred_queue: deferred queue saved OOS b'cast message received from node
- * @unacked_window: # of inbound messages rx'd without ack'ing back to peer
- * @inputq: buffer queue for messages to be delivered upwards
- * @namedq: buffer queue for name table messages to be delivered upwards
- * @next_out: ptr to first unsent outbound message in queue
- * @wakeupq: linked list of wakeup msgs waiting for link congestion to abate
- * @long_msg_seq_no: next identifier to use for outbound fragmented messages
- * @reasm_buf: head of partially reassembled inbound message fragments
- * @bc_rcvr: marks that this is a broadcast receiver link
- * @stats: collects statistics regarding link activity
- */
-struct tipc_link {
- u32 addr;
- char name[TIPC_MAX_LINK_NAME];
- struct tipc_media_addr *media_addr;
- struct net *net;
-
- /* Management and link supervision data */
- u32 peer_session;
- u32 peer_bearer_id;
- u32 bearer_id;
- u32 tolerance;
- unsigned long keepalive_intv;
- u32 abort_limit;
- u32 state;
- u16 peer_caps;
- bool active;
- u32 silent_intv_cnt;
- struct {
- unchar hdr[INT_H_SIZE];
- unchar body[TIPC_MAX_IF_NAME];
- } proto_msg;
- struct tipc_msg *pmsg;
- u32 priority;
- char net_plane;
-
- /* Failover/synch */
- u16 drop_point;
- struct sk_buff *failover_reasm_skb;
-
- /* Max packet negotiation */
- u16 mtu;
- u16 advertised_mtu;
-
- /* Sending */
- struct sk_buff_head transmq;
- struct sk_buff_head backlogq;
- struct {
- u16 len;
- u16 limit;
- } backlog[5];
- u16 snd_nxt;
- u16 last_retransm;
- u16 window;
- u32 stale_count;
-
- /* Reception */
- u16 rcv_nxt;
- u32 rcv_unacked;
- struct sk_buff_head deferdq;
- struct sk_buff_head *inputq;
- struct sk_buff_head *namedq;
-
- /* Congestion handling */
- struct sk_buff_head wakeupq;
-
- /* Fragmentation/reassembly */
- struct sk_buff *reasm_buf;
-
- /* Broadcast */
- u16 ackers;
- u16 acked;
- struct tipc_link *bc_rcvlink;
- struct tipc_link *bc_sndlink;
- int nack_state;
- bool bc_peer_is_up;
-
- /* Statistics */
- struct tipc_stats stats;
-};
-
bool tipc_link_create(struct net *net, char *if_name, int bearer_id,
int tolerance, char net_plane, u32 mtu, int priority,
int window, u32 session, u32 ownnode, u32 peer,
@@ -239,7 +90,6 @@ void tipc_link_tnl_prepare(struct tipc_link *l, struct tipc_link *tnl,
int mtyp, struct sk_buff_head *xmitq);
void tipc_link_build_reset_msg(struct tipc_link *l, struct sk_buff_head *xmitq);
int tipc_link_fsm_evt(struct tipc_link *l, int evt);
-void tipc_link_reset_fragments(struct tipc_link *l_ptr);
bool tipc_link_is_up(struct tipc_link *l);
bool tipc_link_peer_is_down(struct tipc_link *l);
bool tipc_link_is_reset(struct tipc_link *l);
@@ -248,15 +98,27 @@ bool tipc_link_is_synching(struct tipc_link *l);
bool tipc_link_is_failingover(struct tipc_link *l);
bool tipc_link_is_blocked(struct tipc_link *l);
void tipc_link_set_active(struct tipc_link *l, bool active);
-void tipc_link_reset(struct tipc_link *l_ptr);
-int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list,
+void tipc_link_reset(struct tipc_link *l);
+void tipc_link_reset_stats(struct tipc_link *l);
+int tipc_link_xmit(struct tipc_link *link, struct sk_buff_head *list,
struct sk_buff_head *xmitq);
+struct sk_buff_head *tipc_link_inputq(struct tipc_link *l);
+u16 tipc_link_rcv_nxt(struct tipc_link *l);
+u16 tipc_link_acked(struct tipc_link *l);
+u32 tipc_link_id(struct tipc_link *l);
+char *tipc_link_name(struct tipc_link *l);
+char tipc_link_plane(struct tipc_link *l);
+int tipc_link_prio(struct tipc_link *l);
+int tipc_link_window(struct tipc_link *l);
+unsigned long tipc_link_tolerance(struct tipc_link *l);
+void tipc_link_set_tolerance(struct tipc_link *l, u32 tol,
+ struct sk_buff_head *xmitq);
+void tipc_link_set_prio(struct tipc_link *l, u32 prio,
+ struct sk_buff_head *xmitq);
+void tipc_link_set_abort_limit(struct tipc_link *l, u32 limit);
void tipc_link_set_queue_limits(struct tipc_link *l, u32 window);
-
-int tipc_nl_link_dump(struct sk_buff *skb, struct netlink_callback *cb);
-int tipc_nl_link_get(struct sk_buff *skb, struct genl_info *info);
-int tipc_nl_link_set(struct sk_buff *skb, struct genl_info *info);
-int tipc_nl_link_reset_stats(struct sk_buff *skb, struct genl_info *info);
+int __tipc_nl_add_link(struct net *net, struct tipc_nl_msg *msg,
+ struct tipc_link *link, int nlflags);
int tipc_nl_parse_link_prop(struct nlattr *prop, struct nlattr *props[]);
int tipc_link_timeout(struct tipc_link *l, struct sk_buff_head *xmitq);
int tipc_link_rcv(struct tipc_link *l, struct sk_buff *skb,
diff --git a/net/tipc/name_distr.c b/net/tipc/name_distr.c
index c07612bab95c..ebe9d0ff6e9e 100644
--- a/net/tipc/name_distr.c
+++ b/net/tipc/name_distr.c
@@ -84,31 +84,6 @@ static struct sk_buff *named_prepare_buf(struct net *net, u32 type, u32 size,
return buf;
}
-void named_cluster_distribute(struct net *net, struct sk_buff *skb)
-{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct sk_buff *oskb;
- struct tipc_node *node;
- u32 dnode;
-
- rcu_read_lock();
- list_for_each_entry_rcu(node, &tn->node_list, list) {
- dnode = node->addr;
- if (in_own_node(net, dnode))
- continue;
- if (!tipc_node_is_up(node))
- continue;
- oskb = pskb_copy(skb, GFP_ATOMIC);
- if (!oskb)
- break;
- msg_set_destnode(buf_msg(oskb), dnode);
- tipc_node_xmit_skb(net, oskb, dnode, 0);
- }
- rcu_read_unlock();
-
- kfree_skb(skb);
-}
-
/**
* tipc_named_publish - tell other nodes about a new publication by this node
*/
@@ -226,42 +201,6 @@ void tipc_named_node_up(struct net *net, u32 dnode)
tipc_node_xmit(net, &head, dnode, 0);
}
-static void tipc_publ_subscribe(struct net *net, struct publication *publ,
- u32 addr)
-{
- struct tipc_node *node;
-
- if (in_own_node(net, addr))
- return;
-
- node = tipc_node_find(net, addr);
- if (!node) {
- pr_warn("Node subscription rejected, unknown node 0x%x\n",
- addr);
- return;
- }
-
- tipc_node_lock(node);
- list_add_tail(&publ->nodesub_list, &node->publ_list);
- tipc_node_unlock(node);
- tipc_node_put(node);
-}
-
-static void tipc_publ_unsubscribe(struct net *net, struct publication *publ,
- u32 addr)
-{
- struct tipc_node *node;
-
- node = tipc_node_find(net, addr);
- if (!node)
- return;
-
- tipc_node_lock(node);
- list_del_init(&publ->nodesub_list);
- tipc_node_unlock(node);
- tipc_node_put(node);
-}
-
/**
* tipc_publ_purge - remove publication associated with a failed node
*
@@ -277,7 +216,7 @@ static void tipc_publ_purge(struct net *net, struct publication *publ, u32 addr)
p = tipc_nametbl_remove_publ(net, publ->type, publ->lower,
publ->node, publ->ref, publ->key);
if (p)
- tipc_publ_unsubscribe(net, p, addr);
+ tipc_node_unsubscribe(net, &p->nodesub_list, addr);
spin_unlock_bh(&tn->nametbl_lock);
if (p != publ) {
@@ -317,7 +256,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
TIPC_CLUSTER_SCOPE, node,
ntohl(i->ref), ntohl(i->key));
if (publ) {
- tipc_publ_subscribe(net, publ, node);
+ tipc_node_subscribe(net, &publ->nodesub_list, node);
return true;
}
} else if (dtype == WITHDRAWAL) {
@@ -326,7 +265,7 @@ static bool tipc_update_nametbl(struct net *net, struct distr_item *i,
node, ntohl(i->ref),
ntohl(i->key));
if (publ) {
- tipc_publ_unsubscribe(net, publ, node);
+ tipc_node_unsubscribe(net, &publ->nodesub_list, node);
kfree_rcu(publ, rcu);
return true;
}
@@ -397,6 +336,7 @@ void tipc_named_rcv(struct net *net, struct sk_buff_head *inputq)
spin_lock_bh(&tn->nametbl_lock);
for (skb = skb_dequeue(inputq); skb; skb = skb_dequeue(inputq)) {
+ skb_linearize(skb);
msg = buf_msg(skb);
mtype = msg_type(msg);
item = (struct distr_item *)msg_data(msg);
diff --git a/net/tipc/name_distr.h b/net/tipc/name_distr.h
index dd2d9fd80da2..1264ba0af937 100644
--- a/net/tipc/name_distr.h
+++ b/net/tipc/name_distr.h
@@ -69,7 +69,6 @@ struct distr_item {
struct sk_buff *tipc_named_publish(struct net *net, struct publication *publ);
struct sk_buff *tipc_named_withdraw(struct net *net, struct publication *publ);
-void named_cluster_distribute(struct net *net, struct sk_buff *buf);
void tipc_named_node_up(struct net *net, u32 dnode);
void tipc_named_rcv(struct net *net, struct sk_buff_head *msg_queue);
void tipc_named_reinit(struct net *net);
diff --git a/net/tipc/name_table.c b/net/tipc/name_table.c
index 0f47f08bf38f..e190460fe0d3 100644
--- a/net/tipc/name_table.c
+++ b/net/tipc/name_table.c
@@ -42,16 +42,11 @@
#include "subscr.h"
#include "bcast.h"
#include "addr.h"
+#include "node.h"
#include <net/genetlink.h>
#define TIPC_NAMETBL_SIZE 1024 /* must be a power of 2 */
-static const struct nla_policy
-tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = {
- [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED }
-};
-
/**
* struct name_info - name sequence publication info
* @node_list: circular list of publications made by own node
@@ -417,6 +412,9 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
struct tipc_subscription *s)
{
struct sub_seq *sseq = nseq->sseqs;
+ struct tipc_name_seq ns;
+
+ tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns);
list_add(&s->nameseq_list, &nseq->subscriptions);
@@ -424,7 +422,7 @@ static void tipc_nameseq_subscribe(struct name_seq *nseq,
return;
while (sseq != &nseq->sseqs[nseq->first_free]) {
- if (tipc_subscrp_check_overlap(s, sseq->lower, sseq->upper)) {
+ if (tipc_subscrp_check_overlap(&ns, sseq->lower, sseq->upper)) {
struct publication *crs;
struct name_info *info = sseq->info;
int must_report = 1;
@@ -677,7 +675,7 @@ struct publication *tipc_nametbl_publish(struct net *net, u32 type, u32 lower,
spin_unlock_bh(&tn->nametbl_lock);
if (buf)
- named_cluster_distribute(net, buf);
+ tipc_node_broadcast(net, buf);
return publ;
}
@@ -709,7 +707,7 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
spin_unlock_bh(&tn->nametbl_lock);
if (skb) {
- named_cluster_distribute(net, skb);
+ tipc_node_broadcast(net, skb);
return 1;
}
return 0;
@@ -721,9 +719,10 @@ int tipc_nametbl_withdraw(struct net *net, u32 type, u32 lower, u32 ref,
void tipc_nametbl_subscribe(struct tipc_subscription *s)
{
struct tipc_net *tn = net_generic(s->net, tipc_net_id);
- u32 type = s->seq.type;
+ u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
int index = hash(type);
struct name_seq *seq;
+ struct tipc_name_seq ns;
spin_lock_bh(&tn->nametbl_lock);
seq = nametbl_find_seq(s->net, type);
@@ -734,8 +733,9 @@ void tipc_nametbl_subscribe(struct tipc_subscription *s)
tipc_nameseq_subscribe(seq, s);
spin_unlock_bh(&seq->lock);
} else {
+ tipc_subscrp_convert_seq(&s->evt.s.seq, s->swap, &ns);
pr_warn("Failed to create subscription for {%u,%u,%u}\n",
- s->seq.type, s->seq.lower, s->seq.upper);
+ ns.type, ns.lower, ns.upper);
}
spin_unlock_bh(&tn->nametbl_lock);
}
@@ -747,9 +747,10 @@ void tipc_nametbl_unsubscribe(struct tipc_subscription *s)
{
struct tipc_net *tn = net_generic(s->net, tipc_net_id);
struct name_seq *seq;
+ u32 type = tipc_subscrp_convert_seq_type(s->evt.s.seq.type, s->swap);
spin_lock_bh(&tn->nametbl_lock);
- seq = nametbl_find_seq(s->net, s->seq.type);
+ seq = nametbl_find_seq(s->net, type);
if (seq != NULL) {
spin_lock_bh(&seq->lock);
list_del_init(&s->nameseq_list);
diff --git a/net/tipc/net.c b/net/tipc/net.c
index 77bf9113c7a7..28bf4feeb81c 100644
--- a/net/tipc/net.c
+++ b/net/tipc/net.c
@@ -41,11 +41,7 @@
#include "socket.h"
#include "node.h"
#include "bcast.h"
-
-static const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
- [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_NET_ID] = { .type = NLA_U32 }
-};
+#include "netlink.h"
/*
* The TIPC locking policy is designed to ensure a very fine locking
@@ -116,7 +112,6 @@ int tipc_net_start(struct net *net, u32 addr)
tn->own_addr = addr;
tipc_named_reinit(net);
tipc_sk_reinit(net);
- tipc_bcast_reinit(net);
tipc_nametbl_publish(net, TIPC_CFG_SRV, tn->own_addr, tn->own_addr,
TIPC_ZONE_SCOPE, 0, tn->own_addr);
diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
index 7f6475efc984..56935df2167a 100644
--- a/net/tipc/netlink.c
+++ b/net/tipc/netlink.c
@@ -55,6 +55,75 @@ static const struct nla_policy tipc_nl_policy[TIPC_NLA_MAX + 1] = {
[TIPC_NLA_NAME_TABLE] = { .type = NLA_NESTED, }
};
+const struct nla_policy
+tipc_nl_name_table_policy[TIPC_NLA_NAME_TABLE_MAX + 1] = {
+ [TIPC_NLA_NAME_TABLE_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NAME_TABLE_PUBL] = { .type = NLA_NESTED }
+};
+
+const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
+ [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 },
+ [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 },
+ [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED },
+ [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG }
+};
+
+const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
+ [TIPC_NLA_NET_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NET_ID] = { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
+ [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_LINK_NAME] = { .type = NLA_STRING,
+ .len = TIPC_MAX_LINK_NAME },
+ [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 },
+ [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_UP] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_ACTIVE] = { .type = NLA_FLAG },
+ [TIPC_NLA_LINK_PROP] = { .type = NLA_NESTED },
+ [TIPC_NLA_LINK_STATS] = { .type = NLA_NESTED },
+ [TIPC_NLA_LINK_RX] = { .type = NLA_U32 },
+ [TIPC_NLA_LINK_TX] = { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
+ [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 },
+ [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }
+};
+
+/* Properties valid for media, bearer and link */
+const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
+ [TIPC_NLA_PROP_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_PROP_PRIO] = { .type = NLA_U32 },
+ [TIPC_NLA_PROP_TOL] = { .type = NLA_U32 },
+ [TIPC_NLA_PROP_WIN] = { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = {
+ [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_BEARER_NAME] = { .type = NLA_STRING,
+ .len = TIPC_MAX_BEARER_NAME },
+ [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED },
+ [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 }
+};
+
+const struct nla_policy tipc_nl_media_policy[TIPC_NLA_MEDIA_MAX + 1] = {
+ [TIPC_NLA_MEDIA_UNSPEC] = { .type = NLA_UNSPEC },
+ [TIPC_NLA_MEDIA_NAME] = { .type = NLA_STRING },
+ [TIPC_NLA_MEDIA_PROP] = { .type = NLA_NESTED }
+};
+
+const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
+ [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC},
+ [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY,
+ .len = sizeof(struct sockaddr_storage)},
+ [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY,
+ .len = sizeof(struct sockaddr_storage)},
+};
+
/* Users of the legacy API (tipc-config) can't handle that we add operations,
* so we have a separate genl handling for the new API.
*/
@@ -101,18 +170,18 @@ static const struct genl_ops tipc_genl_v2_ops[] = {
},
{
.cmd = TIPC_NL_LINK_GET,
- .doit = tipc_nl_link_get,
- .dumpit = tipc_nl_link_dump,
+ .doit = tipc_nl_node_get_link,
+ .dumpit = tipc_nl_node_dump_link,
.policy = tipc_nl_policy,
},
{
.cmd = TIPC_NL_LINK_SET,
- .doit = tipc_nl_link_set,
+ .doit = tipc_nl_node_set_link,
.policy = tipc_nl_policy,
},
{
.cmd = TIPC_NL_LINK_RESET_STATS,
- .doit = tipc_nl_link_reset_stats,
+ .doit = tipc_nl_node_reset_link_stats,
.policy = tipc_nl_policy,
},
{
diff --git a/net/tipc/netlink.h b/net/tipc/netlink.h
index 08a1db67b927..ed1dbcb4afbd 100644
--- a/net/tipc/netlink.h
+++ b/net/tipc/netlink.h
@@ -35,6 +35,7 @@
#ifndef _TIPC_NETLINK_H
#define _TIPC_NETLINK_H
+#include <net/netlink.h>
extern struct genl_family tipc_genl_family;
int tipc_nlmsg_parse(const struct nlmsghdr *nlh, struct nlattr ***buf);
@@ -45,6 +46,16 @@ struct tipc_nl_msg {
u32 seq;
};
+extern const struct nla_policy tipc_nl_name_table_policy[];
+extern const struct nla_policy tipc_nl_sock_policy[];
+extern const struct nla_policy tipc_nl_net_policy[];
+extern const struct nla_policy tipc_nl_link_policy[];
+extern const struct nla_policy tipc_nl_node_policy[];
+extern const struct nla_policy tipc_nl_prop_policy[];
+extern const struct nla_policy tipc_nl_bearer_policy[];
+extern const struct nla_policy tipc_nl_media_policy[];
+extern const struct nla_policy tipc_nl_udp_policy[];
+
int tipc_netlink_start(void);
int tipc_netlink_compat_start(void);
void tipc_netlink_stop(void);
diff --git a/net/tipc/netlink_compat.c b/net/tipc/netlink_compat.c
index 1eadc95e1132..d7d050f44fc1 100644
--- a/net/tipc/netlink_compat.c
+++ b/net/tipc/netlink_compat.c
@@ -1023,25 +1023,25 @@ static int tipc_nl_compat_handle(struct tipc_nl_compat_msg *msg)
msg->req_type = TIPC_TLV_LINK_NAME;
msg->rep_size = ULTRA_STRING_MAX_LEN;
msg->rep_type = TIPC_TLV_ULTRA_STRING;
- dump.dumpit = tipc_nl_link_dump;
+ dump.dumpit = tipc_nl_node_dump_link;
dump.format = tipc_nl_compat_link_stat_dump;
return tipc_nl_compat_dumpit(&dump, msg);
case TIPC_CMD_GET_LINKS:
msg->req_type = TIPC_TLV_NET_ADDR;
msg->rep_size = ULTRA_STRING_MAX_LEN;
- dump.dumpit = tipc_nl_link_dump;
+ dump.dumpit = tipc_nl_node_dump_link;
dump.format = tipc_nl_compat_link_dump;
return tipc_nl_compat_dumpit(&dump, msg);
case TIPC_CMD_SET_LINK_TOL:
case TIPC_CMD_SET_LINK_PRI:
case TIPC_CMD_SET_LINK_WINDOW:
msg->req_type = TIPC_TLV_LINK_CONFIG;
- doit.doit = tipc_nl_link_set;
+ doit.doit = tipc_nl_node_set_link;
doit.transcode = tipc_nl_compat_link_set;
return tipc_nl_compat_doit(&doit, msg);
case TIPC_CMD_RESET_LINK_STATS:
msg->req_type = TIPC_TLV_LINK_NAME;
- doit.doit = tipc_nl_link_reset_stats;
+ doit.doit = tipc_nl_node_reset_link_stats;
doit.transcode = tipc_nl_compat_link_reset_stats;
return tipc_nl_compat_doit(&doit, msg);
case TIPC_CMD_SHOW_NAME_TABLE:
@@ -1104,8 +1104,8 @@ static int tipc_nl_compat_recv(struct sk_buff *skb, struct genl_info *info)
req_nlh = (struct nlmsghdr *)skb->data;
msg.req = nlmsg_data(req_nlh) + GENL_HDRLEN + TIPC_GENL_HDRLEN;
msg.cmd = req_userhdr->cmd;
- msg.dst_sk = info->dst_sk;
msg.net = genl_info_net(info);
+ msg.dst_sk = skb->sk;
if ((msg.cmd & 0xC000) && (!netlink_net_capable(skb, CAP_NET_ADMIN))) {
msg.rep = tipc_get_err_tlv(TIPC_CFG_NOT_NET_ADMIN);
diff --git a/net/tipc/node.c b/net/tipc/node.c
index 20cddec0a43c..ace178fd3850 100644
--- a/net/tipc/node.c
+++ b/net/tipc/node.c
@@ -41,6 +41,85 @@
#include "socket.h"
#include "bcast.h"
#include "discover.h"
+#include "netlink.h"
+
+#define INVALID_NODE_SIG 0x10000
+
+/* Flags used to take different actions according to flag type
+ * TIPC_NOTIFY_NODE_DOWN: notify node is down
+ * TIPC_NOTIFY_NODE_UP: notify node is up
+ * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type
+ */
+enum {
+ TIPC_NOTIFY_NODE_DOWN = (1 << 3),
+ TIPC_NOTIFY_NODE_UP = (1 << 4),
+ TIPC_NOTIFY_LINK_UP = (1 << 6),
+ TIPC_NOTIFY_LINK_DOWN = (1 << 7)
+};
+
+struct tipc_link_entry {
+ struct tipc_link *link;
+ spinlock_t lock; /* per link */
+ u32 mtu;
+ struct sk_buff_head inputq;
+ struct tipc_media_addr maddr;
+};
+
+struct tipc_bclink_entry {
+ struct tipc_link *link;
+ struct sk_buff_head inputq1;
+ struct sk_buff_head arrvq;
+ struct sk_buff_head inputq2;
+ struct sk_buff_head namedq;
+};
+
+/**
+ * struct tipc_node - TIPC node structure
+ * @addr: network address of node
+ * @ref: reference counter to node object
+ * @lock: rwlock governing access to structure
+ * @net: the applicable net namespace
+ * @hash: links to adjacent nodes in unsorted hash chain
+ * @inputq: pointer to input queue containing messages for msg event
+ * @namedq: pointer to name table input queue with name table messages
+ * @active_links: bearer ids of active links, used as index into links[] array
+ * @links: array containing references to all links to node
+ * @action_flags: bit mask of different types of node actions
+ * @state: connectivity state vs peer node
+ * @sync_point: sequence number where synch/failover is finished
+ * @list: links to adjacent nodes in sorted list of cluster's nodes
+ * @working_links: number of working links to node (both active and standby)
+ * @link_cnt: number of links to node
+ * @capabilities: bitmap, indicating peer node's functional capabilities
+ * @signature: node instance identifier
+ * @link_id: local and remote bearer ids of changing link, if any
+ * @publ_list: list of publications
+ * @rcu: rcu struct for tipc_node
+ */
+struct tipc_node {
+ u32 addr;
+ struct kref kref;
+ rwlock_t lock;
+ struct net *net;
+ struct hlist_node hash;
+ int active_links[2];
+ struct tipc_link_entry links[MAX_BEARERS];
+ struct tipc_bclink_entry bc_entry;
+ int action_flags;
+ struct list_head list;
+ int state;
+ u16 sync_point;
+ int link_cnt;
+ u16 working_links;
+ u16 capabilities;
+ u32 signature;
+ u32 link_id;
+ struct list_head publ_list;
+ struct list_head conn_sks;
+ unsigned long keepalive_intv;
+ struct timer_list timer;
+ struct rcu_head rcu;
+};
/* Node FSM states and events:
*/
@@ -75,6 +154,9 @@ static void node_lost_contact(struct tipc_node *n, struct sk_buff_head *inputq);
static void tipc_node_delete(struct tipc_node *node);
static void tipc_node_timeout(unsigned long data);
static void tipc_node_fsm_evt(struct tipc_node *n, int evt);
+static struct tipc_node *tipc_node_find(struct net *net, u32 addr);
+static void tipc_node_put(struct tipc_node *node);
+static bool tipc_node_is_up(struct tipc_node *n);
struct tipc_sock_conn {
u32 port;
@@ -83,12 +165,32 @@ struct tipc_sock_conn {
struct list_head list;
};
-static const struct nla_policy tipc_nl_node_policy[TIPC_NLA_NODE_MAX + 1] = {
- [TIPC_NLA_NODE_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_NODE_ADDR] = { .type = NLA_U32 },
- [TIPC_NLA_NODE_UP] = { .type = NLA_FLAG }
-};
+static struct tipc_link *node_active_link(struct tipc_node *n, int sel)
+{
+ int bearer_id = n->active_links[sel & 1];
+
+ if (unlikely(bearer_id == INVALID_BEARER_ID))
+ return NULL;
+
+ return n->links[bearer_id].link;
+}
+int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
+{
+ struct tipc_node *n;
+ int bearer_id;
+ unsigned int mtu = MAX_MSG_SIZE;
+
+ n = tipc_node_find(net, addr);
+ if (unlikely(!n))
+ return mtu;
+
+ bearer_id = n->active_links[sel & 1];
+ if (likely(bearer_id != INVALID_BEARER_ID))
+ mtu = n->links[bearer_id].mtu;
+ tipc_node_put(n);
+ return mtu;
+}
/*
* A trivial power-of-two bitmask technique is used for speed, since this
* operation is done for every incoming TIPC packet. The number of hash table
@@ -102,12 +204,13 @@ static unsigned int tipc_hashfn(u32 addr)
static void tipc_node_kref_release(struct kref *kref)
{
- struct tipc_node *node = container_of(kref, struct tipc_node, kref);
+ struct tipc_node *n = container_of(kref, struct tipc_node, kref);
- tipc_node_delete(node);
+ kfree(n->bc_entry.link);
+ kfree_rcu(n, rcu);
}
-void tipc_node_put(struct tipc_node *node)
+static void tipc_node_put(struct tipc_node *node)
{
kref_put(&node->kref, tipc_node_kref_release);
}
@@ -120,87 +223,143 @@ static void tipc_node_get(struct tipc_node *node)
/*
* tipc_node_find - locate specified node object, if it exists
*/
-struct tipc_node *tipc_node_find(struct net *net, u32 addr)
+static struct tipc_node *tipc_node_find(struct net *net, u32 addr)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_net *tn = tipc_net(net);
struct tipc_node *node;
+ unsigned int thash = tipc_hashfn(addr);
if (unlikely(!in_own_cluster_exact(net, addr)))
return NULL;
rcu_read_lock();
- hlist_for_each_entry_rcu(node, &tn->node_htable[tipc_hashfn(addr)],
- hash) {
- if (node->addr == addr) {
- tipc_node_get(node);
- rcu_read_unlock();
- return node;
- }
+ hlist_for_each_entry_rcu(node, &tn->node_htable[thash], hash) {
+ if (node->addr != addr)
+ continue;
+ if (!kref_get_unless_zero(&node->kref))
+ node = NULL;
+ break;
}
rcu_read_unlock();
- return NULL;
+ return node;
+}
+
+static void tipc_node_read_lock(struct tipc_node *n)
+{
+ read_lock_bh(&n->lock);
+}
+
+static void tipc_node_read_unlock(struct tipc_node *n)
+{
+ read_unlock_bh(&n->lock);
+}
+
+static void tipc_node_write_lock(struct tipc_node *n)
+{
+ write_lock_bh(&n->lock);
+}
+
+static void tipc_node_write_unlock(struct tipc_node *n)
+{
+ struct net *net = n->net;
+ u32 addr = 0;
+ u32 flags = n->action_flags;
+ u32 link_id = 0;
+ struct list_head *publ_list;
+
+ if (likely(!flags)) {
+ write_unlock_bh(&n->lock);
+ return;
+ }
+
+ addr = n->addr;
+ link_id = n->link_id;
+ publ_list = &n->publ_list;
+
+ n->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
+ TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP);
+
+ write_unlock_bh(&n->lock);
+
+ if (flags & TIPC_NOTIFY_NODE_DOWN)
+ tipc_publ_notify(net, publ_list, addr);
+
+ if (flags & TIPC_NOTIFY_NODE_UP)
+ tipc_named_node_up(net, addr);
+
+ if (flags & TIPC_NOTIFY_LINK_UP)
+ tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
+ TIPC_NODE_SCOPE, link_id, addr);
+
+ if (flags & TIPC_NOTIFY_LINK_DOWN)
+ tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
+ link_id, addr);
}
struct tipc_node *tipc_node_create(struct net *net, u32 addr, u16 capabilities)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
- struct tipc_node *n_ptr, *temp_node;
+ struct tipc_node *n, *temp_node;
+ int i;
spin_lock_bh(&tn->node_list_lock);
- n_ptr = tipc_node_find(net, addr);
- if (n_ptr)
+ n = tipc_node_find(net, addr);
+ if (n)
goto exit;
- n_ptr = kzalloc(sizeof(*n_ptr), GFP_ATOMIC);
- if (!n_ptr) {
+ n = kzalloc(sizeof(*n), GFP_ATOMIC);
+ if (!n) {
pr_warn("Node creation failed, no memory\n");
goto exit;
}
- n_ptr->addr = addr;
- n_ptr->net = net;
- n_ptr->capabilities = capabilities;
- kref_init(&n_ptr->kref);
- spin_lock_init(&n_ptr->lock);
- INIT_HLIST_NODE(&n_ptr->hash);
- INIT_LIST_HEAD(&n_ptr->list);
- INIT_LIST_HEAD(&n_ptr->publ_list);
- INIT_LIST_HEAD(&n_ptr->conn_sks);
- skb_queue_head_init(&n_ptr->bc_entry.namedq);
- skb_queue_head_init(&n_ptr->bc_entry.inputq1);
- __skb_queue_head_init(&n_ptr->bc_entry.arrvq);
- skb_queue_head_init(&n_ptr->bc_entry.inputq2);
- hlist_add_head_rcu(&n_ptr->hash, &tn->node_htable[tipc_hashfn(addr)]);
- list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
- if (n_ptr->addr < temp_node->addr)
- break;
- }
- list_add_tail_rcu(&n_ptr->list, &temp_node->list);
- n_ptr->state = SELF_DOWN_PEER_LEAVING;
- n_ptr->signature = INVALID_NODE_SIG;
- n_ptr->active_links[0] = INVALID_BEARER_ID;
- n_ptr->active_links[1] = INVALID_BEARER_ID;
- if (!tipc_link_bc_create(net, tipc_own_addr(net), n_ptr->addr,
- U16_MAX, tipc_bc_sndlink(net)->window,
- n_ptr->capabilities,
- &n_ptr->bc_entry.inputq1,
- &n_ptr->bc_entry.namedq,
+ n->addr = addr;
+ n->net = net;
+ n->capabilities = capabilities;
+ kref_init(&n->kref);
+ rwlock_init(&n->lock);
+ INIT_HLIST_NODE(&n->hash);
+ INIT_LIST_HEAD(&n->list);
+ INIT_LIST_HEAD(&n->publ_list);
+ INIT_LIST_HEAD(&n->conn_sks);
+ skb_queue_head_init(&n->bc_entry.namedq);
+ skb_queue_head_init(&n->bc_entry.inputq1);
+ __skb_queue_head_init(&n->bc_entry.arrvq);
+ skb_queue_head_init(&n->bc_entry.inputq2);
+ for (i = 0; i < MAX_BEARERS; i++)
+ spin_lock_init(&n->links[i].lock);
+ n->state = SELF_DOWN_PEER_LEAVING;
+ n->signature = INVALID_NODE_SIG;
+ n->active_links[0] = INVALID_BEARER_ID;
+ n->active_links[1] = INVALID_BEARER_ID;
+ if (!tipc_link_bc_create(net, tipc_own_addr(net), n->addr,
+ U16_MAX,
+ tipc_link_window(tipc_bc_sndlink(net)),
+ n->capabilities,
+ &n->bc_entry.inputq1,
+ &n->bc_entry.namedq,
tipc_bc_sndlink(net),
- &n_ptr->bc_entry.link)) {
+ &n->bc_entry.link)) {
pr_warn("Broadcast rcv link creation failed, no memory\n");
- kfree(n_ptr);
- n_ptr = NULL;
+ kfree(n);
+ n = NULL;
goto exit;
}
- tipc_node_get(n_ptr);
- setup_timer(&n_ptr->timer, tipc_node_timeout, (unsigned long)n_ptr);
- n_ptr->keepalive_intv = U32_MAX;
+ tipc_node_get(n);
+ setup_timer(&n->timer, tipc_node_timeout, (unsigned long)n);
+ n->keepalive_intv = U32_MAX;
+ hlist_add_head_rcu(&n->hash, &tn->node_htable[tipc_hashfn(addr)]);
+ list_for_each_entry_rcu(temp_node, &tn->node_list, list) {
+ if (n->addr < temp_node->addr)
+ break;
+ }
+ list_add_tail_rcu(&n->list, &temp_node->list);
exit:
spin_unlock_bh(&tn->node_list_lock);
- return n_ptr;
+ return n;
}
static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
{
- unsigned long tol = l->tolerance;
+ unsigned long tol = tipc_link_tolerance(l);
unsigned long intv = ((tol / 4) > 500) ? 500 : tol / 4;
unsigned long keepalive_intv = msecs_to_jiffies(intv);
@@ -209,31 +368,66 @@ static void tipc_node_calculate_timer(struct tipc_node *n, struct tipc_link *l)
n->keepalive_intv = keepalive_intv;
/* Ensure link's abort limit corresponds to current interval */
- l->abort_limit = l->tolerance / jiffies_to_msecs(n->keepalive_intv);
+ tipc_link_set_abort_limit(l, tol / jiffies_to_msecs(n->keepalive_intv));
}
static void tipc_node_delete(struct tipc_node *node)
{
list_del_rcu(&node->list);
hlist_del_rcu(&node->hash);
- kfree(node->bc_entry.link);
- kfree_rcu(node, rcu);
+ tipc_node_put(node);
+
+ del_timer_sync(&node->timer);
+ tipc_node_put(node);
}
void tipc_node_stop(struct net *net)
{
- struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_net *tn = tipc_net(net);
struct tipc_node *node, *t_node;
spin_lock_bh(&tn->node_list_lock);
- list_for_each_entry_safe(node, t_node, &tn->node_list, list) {
- if (del_timer(&node->timer))
- tipc_node_put(node);
- tipc_node_put(node);
- }
+ list_for_each_entry_safe(node, t_node, &tn->node_list, list)
+ tipc_node_delete(node);
spin_unlock_bh(&tn->node_list_lock);
}
+void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr)
+{
+ struct tipc_node *n;
+
+ if (in_own_node(net, addr))
+ return;
+
+ n = tipc_node_find(net, addr);
+ if (!n) {
+ pr_warn("Node subscribe rejected, unknown node 0x%x\n", addr);
+ return;
+ }
+ tipc_node_write_lock(n);
+ list_add_tail(subscr, &n->publ_list);
+ tipc_node_write_unlock(n);
+ tipc_node_put(n);
+}
+
+void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr)
+{
+ struct tipc_node *n;
+
+ if (in_own_node(net, addr))
+ return;
+
+ n = tipc_node_find(net, addr);
+ if (!n) {
+ pr_warn("Node unsubscribe rejected, unknown node 0x%x\n", addr);
+ return;
+ }
+ tipc_node_write_lock(n);
+ list_del_init(subscr);
+ tipc_node_write_unlock(n);
+ tipc_node_put(n);
+}
+
int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port)
{
struct tipc_node *node;
@@ -257,9 +451,9 @@ int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port)
conn->port = port;
conn->peer_port = peer_port;
- tipc_node_lock(node);
+ tipc_node_write_lock(node);
list_add_tail(&conn->list, &node->conn_sks);
- tipc_node_unlock(node);
+ tipc_node_write_unlock(node);
exit:
tipc_node_put(node);
return err;
@@ -277,14 +471,14 @@ void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port)
if (!node)
return;
- tipc_node_lock(node);
+ tipc_node_write_lock(node);
list_for_each_entry_safe(conn, safe, &node->conn_sks, list) {
if (port != conn->port)
continue;
list_del(&conn->list);
kfree(conn);
}
- tipc_node_unlock(node);
+ tipc_node_write_unlock(node);
tipc_node_put(node);
}
@@ -301,21 +495,21 @@ static void tipc_node_timeout(unsigned long data)
__skb_queue_head_init(&xmitq);
for (bearer_id = 0; bearer_id < MAX_BEARERS; bearer_id++) {
- tipc_node_lock(n);
+ tipc_node_read_lock(n);
le = &n->links[bearer_id];
+ spin_lock_bh(&le->lock);
if (le->link) {
/* Link tolerance may change asynchronously: */
tipc_node_calculate_timer(n, le->link);
rc = tipc_link_timeout(le->link, &xmitq);
}
- tipc_node_unlock(n);
+ spin_unlock_bh(&le->lock);
+ tipc_node_read_unlock(n);
tipc_bearer_xmit(n->net, bearer_id, &xmitq, &le->maddr);
if (rc & TIPC_LINK_DOWN_EVT)
tipc_node_link_down(n, bearer_id, false);
}
- if (!mod_timer(&n->timer, jiffies + n->keepalive_intv))
- tipc_node_get(n);
- tipc_node_put(n);
+ mod_timer(&n->timer, jiffies + n->keepalive_intv);
}
/**
@@ -340,16 +534,16 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
n->working_links++;
n->action_flags |= TIPC_NOTIFY_LINK_UP;
- n->link_id = nl->peer_bearer_id << 16 | bearer_id;
+ n->link_id = tipc_link_id(nl);
/* Leave room for tunnel header when returning 'mtu' to users: */
- n->links[bearer_id].mtu = nl->mtu - INT_H_SIZE;
+ n->links[bearer_id].mtu = tipc_link_mtu(nl) - INT_H_SIZE;
tipc_bearer_add_dest(n->net, bearer_id, n->addr);
tipc_bcast_inc_bearer_dst_cnt(n->net, bearer_id);
pr_debug("Established link <%s> on network plane %c\n",
- nl->name, nl->net_plane);
+ tipc_link_name(nl), tipc_link_plane(nl));
/* First link? => give it both slots */
if (!ol) {
@@ -362,17 +556,17 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
}
/* Second link => redistribute slots */
- if (nl->priority > ol->priority) {
- pr_debug("Old link <%s> becomes standby\n", ol->name);
+ if (tipc_link_prio(nl) > tipc_link_prio(ol)) {
+ pr_debug("Old link <%s> becomes standby\n", tipc_link_name(ol));
*slot0 = bearer_id;
*slot1 = bearer_id;
tipc_link_set_active(nl, true);
tipc_link_set_active(ol, false);
- } else if (nl->priority == ol->priority) {
+ } else if (tipc_link_prio(nl) == tipc_link_prio(ol)) {
tipc_link_set_active(nl, true);
*slot1 = bearer_id;
} else {
- pr_debug("New link <%s> is standby\n", nl->name);
+ pr_debug("New link <%s> is standby\n", tipc_link_name(nl));
}
/* Prepare synchronization with first link */
@@ -387,9 +581,9 @@ static void __tipc_node_link_up(struct tipc_node *n, int bearer_id,
static void tipc_node_link_up(struct tipc_node *n, int bearer_id,
struct sk_buff_head *xmitq)
{
- tipc_node_lock(n);
+ tipc_node_write_lock(n);
__tipc_node_link_up(n, bearer_id, xmitq);
- tipc_node_unlock(n);
+ tipc_node_write_unlock(n);
}
/**
@@ -402,7 +596,7 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
struct tipc_link_entry *le = &n->links[*bearer_id];
int *slot0 = &n->active_links[0];
int *slot1 = &n->active_links[1];
- int i, highest = 0;
+ int i, highest = 0, prio;
struct tipc_link *l, *_l, *tnl;
l = n->links[*bearer_id].link;
@@ -411,12 +605,12 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
n->working_links--;
n->action_flags |= TIPC_NOTIFY_LINK_DOWN;
- n->link_id = l->peer_bearer_id << 16 | *bearer_id;
+ n->link_id = tipc_link_id(l);
tipc_bearer_remove_dest(n->net, *bearer_id, n->addr);
pr_debug("Lost link <%s> on network plane %c\n",
- l->name, l->net_plane);
+ tipc_link_name(l), tipc_link_plane(l));
/* Select new active link if any available */
*slot0 = INVALID_BEARER_ID;
@@ -427,10 +621,11 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
continue;
if (_l == l)
continue;
- if (_l->priority < highest)
+ prio = tipc_link_prio(_l);
+ if (prio < highest)
continue;
- if (_l->priority > highest) {
- highest = _l->priority;
+ if (prio > highest) {
+ highest = prio;
*slot0 = i;
*slot1 = i;
continue;
@@ -453,17 +648,17 @@ static void __tipc_node_link_down(struct tipc_node *n, int *bearer_id,
tipc_bcast_dec_bearer_dst_cnt(n->net, *bearer_id);
/* There is still a working link => initiate failover */
- tnl = node_active_link(n, 0);
+ *bearer_id = n->active_links[0];
+ tnl = n->links[*bearer_id].link;
tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
- n->sync_point = tnl->rcv_nxt + (U16_MAX / 2 - 1);
+ n->sync_point = tipc_link_rcv_nxt(tnl) + (U16_MAX / 2 - 1);
tipc_link_tnl_prepare(l, tnl, FAILOVER_MSG, xmitq);
tipc_link_reset(l);
tipc_link_fsm_evt(l, LINK_RESET_EVT);
tipc_link_fsm_evt(l, LINK_FAILOVER_BEGIN_EVT);
tipc_node_fsm_evt(n, NODE_FAILOVER_BEGIN_EVT);
- *maddr = &n->links[tnl->bearer_id].maddr;
- *bearer_id = tnl->bearer_id;
+ *maddr = &n->links[*bearer_id].maddr;
}
static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
@@ -478,7 +673,7 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
__skb_queue_head_init(&xmitq);
- tipc_node_lock(n);
+ tipc_node_write_lock(n);
if (!tipc_link_is_establishing(l)) {
__tipc_node_link_down(n, &bearer_id, &xmitq, &maddr);
if (delete) {
@@ -490,12 +685,12 @@ static void tipc_node_link_down(struct tipc_node *n, int bearer_id, bool delete)
/* Defuse pending tipc_node_link_up() */
tipc_link_fsm_evt(l, LINK_RESET_EVT);
}
- tipc_node_unlock(n);
+ tipc_node_write_unlock(n);
tipc_bearer_xmit(n->net, bearer_id, &xmitq, maddr);
tipc_sk_rcv(n->net, &le->inputq);
}
-bool tipc_node_is_up(struct tipc_node *n)
+static bool tipc_node_is_up(struct tipc_node *n)
{
return n->active_links[0] != INVALID_BEARER_ID;
}
@@ -523,7 +718,7 @@ void tipc_node_check_dest(struct net *net, u32 onode,
if (!n)
return;
- tipc_node_lock(n);
+ tipc_node_write_lock(n);
le = &n->links[b->identity];
@@ -626,8 +821,8 @@ void tipc_node_check_dest(struct net *net, u32 onode,
}
memcpy(&le->maddr, maddr, sizeof(*maddr));
exit:
- tipc_node_unlock(n);
- if (reset && !tipc_link_is_reset(l))
+ tipc_node_write_unlock(n);
+ if (reset && l && !tipc_link_is_reset(l))
tipc_node_link_down(n, b->identity, false);
tipc_node_put(n);
}
@@ -834,24 +1029,6 @@ illegal_evt:
pr_err("Illegal node fsm evt %x in state %x\n", evt, state);
}
-bool tipc_node_filter_pkt(struct tipc_node *n, struct tipc_msg *hdr)
-{
- int state = n->state;
-
- if (likely(state == SELF_UP_PEER_UP))
- return true;
-
- if (state == SELF_LEAVING_PEER_DOWN)
- return false;
-
- if (state == SELF_DOWN_PEER_LEAVING) {
- if (msg_peer_node_is_up(hdr))
- return false;
- }
-
- return true;
-}
-
static void node_lost_contact(struct tipc_node *n,
struct sk_buff_head *inputq)
{
@@ -913,56 +1090,18 @@ int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 addr,
if (bearer_id >= MAX_BEARERS)
goto exit;
- tipc_node_lock(node);
+ tipc_node_read_lock(node);
link = node->links[bearer_id].link;
if (link) {
- strncpy(linkname, link->name, len);
+ strncpy(linkname, tipc_link_name(link), len);
err = 0;
}
exit:
- tipc_node_unlock(node);
+ tipc_node_read_unlock(node);
tipc_node_put(node);
return err;
}
-void tipc_node_unlock(struct tipc_node *node)
-{
- struct net *net = node->net;
- u32 addr = 0;
- u32 flags = node->action_flags;
- u32 link_id = 0;
- struct list_head *publ_list;
-
- if (likely(!flags)) {
- spin_unlock_bh(&node->lock);
- return;
- }
-
- addr = node->addr;
- link_id = node->link_id;
- publ_list = &node->publ_list;
-
- node->action_flags &= ~(TIPC_NOTIFY_NODE_DOWN | TIPC_NOTIFY_NODE_UP |
- TIPC_NOTIFY_LINK_DOWN | TIPC_NOTIFY_LINK_UP);
-
- spin_unlock_bh(&node->lock);
-
- if (flags & TIPC_NOTIFY_NODE_DOWN)
- tipc_publ_notify(net, publ_list, addr);
-
- if (flags & TIPC_NOTIFY_NODE_UP)
- tipc_named_node_up(net, addr);
-
- if (flags & TIPC_NOTIFY_LINK_UP)
- tipc_nametbl_publish(net, TIPC_LINK_STATE, addr, addr,
- TIPC_NODE_SCOPE, link_id, addr);
-
- if (flags & TIPC_NOTIFY_LINK_DOWN)
- tipc_nametbl_withdraw(net, TIPC_LINK_STATE, addr,
- link_id, addr);
-
-}
-
/* Caller should hold node lock for the passed node */
static int __tipc_nl_add_node(struct tipc_nl_msg *msg, struct tipc_node *node)
{
@@ -997,20 +1136,6 @@ msg_full:
return -EMSGSIZE;
}
-static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel,
- int *bearer_id,
- struct tipc_media_addr **maddr)
-{
- int id = n->active_links[sel & 1];
-
- if (unlikely(id < 0))
- return NULL;
-
- *bearer_id = id;
- *maddr = &n->links[id].maddr;
- return n->links[id].link;
-}
-
/**
* tipc_node_xmit() is the general link level function for message sending
* @net: the applicable net namespace
@@ -1018,38 +1143,51 @@ static struct tipc_link *tipc_node_select_link(struct tipc_node *n, int sel,
* @dnode: address of destination node
* @selector: a number used for deterministic link selection
* Consumes the buffer chain, except when returning -ELINKCONG
- * Returns 0 if success, otherwise errno: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE
+ * Returns 0 if success, otherwise: -ELINKCONG,-EHOSTUNREACH,-EMSGSIZE,-ENOBUF
*/
int tipc_node_xmit(struct net *net, struct sk_buff_head *list,
u32 dnode, int selector)
{
- struct tipc_link *l = NULL;
+ struct tipc_link_entry *le = NULL;
struct tipc_node *n;
struct sk_buff_head xmitq;
- struct tipc_media_addr *maddr;
int bearer_id;
- int rc = -EHOSTUNREACH;
+ int rc;
- __skb_queue_head_init(&xmitq);
- n = tipc_node_find(net, dnode);
- if (likely(n)) {
- tipc_node_lock(n);
- l = tipc_node_select_link(n, selector, &bearer_id, &maddr);
- if (likely(l))
- rc = tipc_link_xmit(l, list, &xmitq);
- tipc_node_unlock(n);
- if (unlikely(rc == -ENOBUFS))
- tipc_node_link_down(n, bearer_id, false);
- tipc_node_put(n);
- }
- if (likely(!rc)) {
- tipc_bearer_xmit(net, bearer_id, &xmitq, maddr);
- return 0;
- }
- if (likely(in_own_node(net, dnode))) {
+ if (in_own_node(net, dnode)) {
tipc_sk_rcv(net, list);
return 0;
}
+
+ n = tipc_node_find(net, dnode);
+ if (unlikely(!n)) {
+ skb_queue_purge(list);
+ return -EHOSTUNREACH;
+ }
+
+ tipc_node_read_lock(n);
+ bearer_id = n->active_links[selector & 1];
+ if (unlikely(bearer_id == INVALID_BEARER_ID)) {
+ tipc_node_read_unlock(n);
+ tipc_node_put(n);
+ skb_queue_purge(list);
+ return -EHOSTUNREACH;
+ }
+
+ __skb_queue_head_init(&xmitq);
+ le = &n->links[bearer_id];
+ spin_lock_bh(&le->lock);
+ rc = tipc_link_xmit(le->link, list, &xmitq);
+ spin_unlock_bh(&le->lock);
+ tipc_node_read_unlock(n);
+
+ if (likely(rc == 0))
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &le->maddr);
+ else if (rc == -ENOBUFS)
+ tipc_node_link_down(n, bearer_id, false);
+
+ tipc_node_put(n);
+
return rc;
}
@@ -1075,6 +1213,30 @@ int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dnode,
return 0;
}
+void tipc_node_broadcast(struct net *net, struct sk_buff *skb)
+{
+ struct sk_buff *txskb;
+ struct tipc_node *n;
+ u32 dst;
+
+ rcu_read_lock();
+ list_for_each_entry_rcu(n, tipc_nodes(net), list) {
+ dst = n->addr;
+ if (in_own_node(net, dst))
+ continue;
+ if (!tipc_node_is_up(n))
+ continue;
+ txskb = pskb_copy(skb, GFP_ATOMIC);
+ if (!txskb)
+ break;
+ msg_set_destnode(buf_msg(txskb), dst);
+ tipc_node_xmit_skb(net, txskb, dst, 0);
+ }
+ rcu_read_unlock();
+
+ kfree_skb(skb);
+}
+
/**
* tipc_node_bc_rcv - process TIPC broadcast packet arriving from off-node
* @net: the applicable net namespace
@@ -1116,9 +1278,9 @@ static void tipc_node_bc_rcv(struct net *net, struct sk_buff *skb, int bearer_id
/* Broadcast ACKs are sent on a unicast link */
if (rc & TIPC_LINK_SND_BC_ACK) {
- tipc_node_lock(n);
+ tipc_node_read_lock(n);
tipc_link_build_ack_msg(le->link, &xmitq);
- tipc_node_unlock(n);
+ tipc_node_read_unlock(n);
}
if (!skb_queue_empty(&xmitq))
@@ -1151,30 +1313,30 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
u16 oseqno = msg_seqno(hdr);
u16 iseqno = msg_seqno(msg_get_wrapped(hdr));
u16 exp_pkts = msg_msgcnt(hdr);
- u16 rcv_nxt, syncpt, dlv_nxt;
+ u16 rcv_nxt, syncpt, dlv_nxt, inputq_len;
int state = n->state;
struct tipc_link *l, *tnl, *pl = NULL;
struct tipc_media_addr *maddr;
- int i, pb_id;
+ int pb_id;
l = n->links[bearer_id].link;
if (!l)
return false;
- rcv_nxt = l->rcv_nxt;
+ rcv_nxt = tipc_link_rcv_nxt(l);
if (likely((state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL)))
return true;
/* Find parallel link, if any */
- for (i = 0; i < MAX_BEARERS; i++) {
- if ((i != bearer_id) && n->links[i].link) {
- pl = n->links[i].link;
+ for (pb_id = 0; pb_id < MAX_BEARERS; pb_id++) {
+ if ((pb_id != bearer_id) && n->links[pb_id].link) {
+ pl = n->links[pb_id].link;
break;
}
}
- /* Update node accesibility if applicable */
+ /* Check and update node accesibility if applicable */
if (state == SELF_UP_PEER_COMING) {
if (!tipc_link_is_up(l))
return true;
@@ -1187,8 +1349,12 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
if (msg_peer_node_is_up(hdr))
return false;
tipc_node_fsm_evt(n, PEER_LOST_CONTACT_EVT);
+ return true;
}
+ if (state == SELF_LEAVING_PEER_DOWN)
+ return false;
+
/* Ignore duplicate packets */
if ((usr != LINK_PROTOCOL) && less(oseqno, rcv_nxt))
return true;
@@ -1197,9 +1363,9 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
if ((usr == TUNNEL_PROTOCOL) && (mtyp == FAILOVER_MSG)) {
syncpt = oseqno + exp_pkts - 1;
if (pl && tipc_link_is_up(pl)) {
- pb_id = pl->bearer_id;
__tipc_node_link_down(n, &pb_id, xmitq, &maddr);
- tipc_skb_queue_splice_tail_init(pl->inputq, l->inputq);
+ tipc_skb_queue_splice_tail_init(tipc_link_inputq(pl),
+ tipc_link_inputq(l));
}
/* If pkts arrive out of order, use lowest calculated syncpt */
if (less(syncpt, n->sync_point))
@@ -1232,19 +1398,18 @@ static bool tipc_node_check_state(struct tipc_node *n, struct sk_buff *skb,
tipc_link_fsm_evt(l, LINK_SYNCH_BEGIN_EVT);
tipc_node_fsm_evt(n, NODE_SYNCH_BEGIN_EVT);
}
- if (less(syncpt, n->sync_point))
- n->sync_point = syncpt;
}
/* Open tunnel link when parallel link reaches synch point */
- if ((n->state == NODE_SYNCHING) && tipc_link_is_synching(l)) {
+ if (n->state == NODE_SYNCHING) {
if (tipc_link_is_synching(l)) {
tnl = l;
} else {
tnl = pl;
pl = l;
}
- dlv_nxt = pl->rcv_nxt - mod(skb_queue_len(pl->inputq));
+ inputq_len = skb_queue_len(tipc_link_inputq(pl));
+ dlv_nxt = tipc_link_rcv_nxt(pl) - inputq_len;
if (more(dlv_nxt, n->sync_point)) {
tipc_link_fsm_evt(tnl, LINK_SYNCH_END_EVT);
tipc_node_fsm_evt(n, NODE_SYNCH_END_EVT);
@@ -1304,22 +1469,32 @@ void tipc_rcv(struct net *net, struct sk_buff *skb, struct tipc_bearer *b)
/* Ensure broadcast reception is in synch with peer's send state */
if (unlikely(usr == LINK_PROTOCOL))
tipc_bcast_sync_rcv(net, n->bc_entry.link, hdr);
- else if (unlikely(n->bc_entry.link->acked != bc_ack))
+ else if (unlikely(tipc_link_acked(n->bc_entry.link) != bc_ack))
tipc_bcast_ack_rcv(net, n->bc_entry.link, bc_ack);
- tipc_node_lock(n);
-
- /* Is reception permitted at the moment ? */
- if (!tipc_node_filter_pkt(n, hdr))
- goto unlock;
-
- /* Check and if necessary update node state */
- if (likely(tipc_node_check_state(n, skb, bearer_id, &xmitq))) {
- rc = tipc_link_rcv(le->link, skb, &xmitq);
- skb = NULL;
+ /* Receive packet directly if conditions permit */
+ tipc_node_read_lock(n);
+ if (likely((n->state == SELF_UP_PEER_UP) && (usr != TUNNEL_PROTOCOL))) {
+ spin_lock_bh(&le->lock);
+ if (le->link) {
+ rc = tipc_link_rcv(le->link, skb, &xmitq);
+ skb = NULL;
+ }
+ spin_unlock_bh(&le->lock);
+ }
+ tipc_node_read_unlock(n);
+
+ /* Check/update node state before receiving */
+ if (unlikely(skb)) {
+ tipc_node_write_lock(n);
+ if (tipc_node_check_state(n, skb, bearer_id, &xmitq)) {
+ if (le->link) {
+ rc = tipc_link_rcv(le->link, skb, &xmitq);
+ skb = NULL;
+ }
+ }
+ tipc_node_write_unlock(n);
}
-unlock:
- tipc_node_unlock(n);
if (unlikely(rc & TIPC_LINK_UP_EVT))
tipc_node_link_up(n, bearer_id, &xmitq);
@@ -1384,15 +1559,15 @@ int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb)
continue;
}
- tipc_node_lock(node);
+ tipc_node_read_lock(node);
err = __tipc_nl_add_node(&msg, node);
if (err) {
last_addr = node->addr;
- tipc_node_unlock(node);
+ tipc_node_read_unlock(node);
goto out;
}
- tipc_node_unlock(node);
+ tipc_node_read_unlock(node);
}
done = 1;
out:
@@ -1402,3 +1577,317 @@ out:
return skb->len;
}
+
+/* tipc_node_find_by_name - locate owner node of link by link's name
+ * @net: the applicable net namespace
+ * @name: pointer to link name string
+ * @bearer_id: pointer to index in 'node->links' array where the link was found.
+ *
+ * Returns pointer to node owning the link, or 0 if no matching link is found.
+ */
+static struct tipc_node *tipc_node_find_by_name(struct net *net,
+ const char *link_name,
+ unsigned int *bearer_id)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_link *l;
+ struct tipc_node *n;
+ struct tipc_node *found_node = NULL;
+ int i;
+
+ *bearer_id = 0;
+ rcu_read_lock();
+ list_for_each_entry_rcu(n, &tn->node_list, list) {
+ tipc_node_read_lock(n);
+ for (i = 0; i < MAX_BEARERS; i++) {
+ l = n->links[i].link;
+ if (l && !strcmp(tipc_link_name(l), link_name)) {
+ *bearer_id = i;
+ found_node = n;
+ break;
+ }
+ }
+ tipc_node_read_unlock(n);
+ if (found_node)
+ break;
+ }
+ rcu_read_unlock();
+
+ return found_node;
+}
+
+int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ int res = 0;
+ int bearer_id;
+ char *name;
+ struct tipc_link *link;
+ struct tipc_node *node;
+ struct sk_buff_head xmitq;
+ struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+
+ __skb_queue_head_init(&xmitq);
+
+ if (!info->attrs[TIPC_NLA_LINK])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX,
+ info->attrs[TIPC_NLA_LINK],
+ tipc_nl_link_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_LINK_NAME])
+ return -EINVAL;
+
+ name = nla_data(attrs[TIPC_NLA_LINK_NAME]);
+
+ if (strcmp(name, tipc_bclink_name) == 0)
+ return tipc_nl_bc_link_set(net, attrs);
+
+ node = tipc_node_find_by_name(net, name, &bearer_id);
+ if (!node)
+ return -EINVAL;
+
+ tipc_node_read_lock(node);
+
+ link = node->links[bearer_id].link;
+ if (!link) {
+ res = -EINVAL;
+ goto out;
+ }
+
+ if (attrs[TIPC_NLA_LINK_PROP]) {
+ struct nlattr *props[TIPC_NLA_PROP_MAX + 1];
+
+ err = tipc_nl_parse_link_prop(attrs[TIPC_NLA_LINK_PROP],
+ props);
+ if (err) {
+ res = err;
+ goto out;
+ }
+
+ if (props[TIPC_NLA_PROP_TOL]) {
+ u32 tol;
+
+ tol = nla_get_u32(props[TIPC_NLA_PROP_TOL]);
+ tipc_link_set_tolerance(link, tol, &xmitq);
+ }
+ if (props[TIPC_NLA_PROP_PRIO]) {
+ u32 prio;
+
+ prio = nla_get_u32(props[TIPC_NLA_PROP_PRIO]);
+ tipc_link_set_prio(link, prio, &xmitq);
+ }
+ if (props[TIPC_NLA_PROP_WIN]) {
+ u32 win;
+
+ win = nla_get_u32(props[TIPC_NLA_PROP_WIN]);
+ tipc_link_set_queue_limits(link, win);
+ }
+ }
+
+out:
+ tipc_node_read_unlock(node);
+ tipc_bearer_xmit(net, bearer_id, &xmitq, &node->links[bearer_id].maddr);
+ return res;
+}
+
+int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info)
+{
+ struct net *net = genl_info_net(info);
+ struct tipc_nl_msg msg;
+ char *name;
+ int err;
+
+ msg.portid = info->snd_portid;
+ msg.seq = info->snd_seq;
+
+ if (!info->attrs[TIPC_NLA_LINK_NAME])
+ return -EINVAL;
+ name = nla_data(info->attrs[TIPC_NLA_LINK_NAME]);
+
+ msg.skb = nlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+ if (!msg.skb)
+ return -ENOMEM;
+
+ if (strcmp(name, tipc_bclink_name) == 0) {
+ err = tipc_nl_add_bc_link(net, &msg);
+ if (err) {
+ nlmsg_free(msg.skb);
+ return err;
+ }
+ } else {
+ int bearer_id;
+ struct tipc_node *node;
+ struct tipc_link *link;
+
+ node = tipc_node_find_by_name(net, name, &bearer_id);
+ if (!node)
+ return -EINVAL;
+
+ tipc_node_read_lock(node);
+ link = node->links[bearer_id].link;
+ if (!link) {
+ tipc_node_read_unlock(node);
+ nlmsg_free(msg.skb);
+ return -EINVAL;
+ }
+
+ err = __tipc_nl_add_link(net, &msg, link, 0);
+ tipc_node_read_unlock(node);
+ if (err) {
+ nlmsg_free(msg.skb);
+ return err;
+ }
+ }
+
+ return genlmsg_reply(msg.skb, info);
+}
+
+int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info)
+{
+ int err;
+ char *link_name;
+ unsigned int bearer_id;
+ struct tipc_link *link;
+ struct tipc_node *node;
+ struct nlattr *attrs[TIPC_NLA_LINK_MAX + 1];
+ struct net *net = sock_net(skb->sk);
+ struct tipc_link_entry *le;
+
+ if (!info->attrs[TIPC_NLA_LINK])
+ return -EINVAL;
+
+ err = nla_parse_nested(attrs, TIPC_NLA_LINK_MAX,
+ info->attrs[TIPC_NLA_LINK],
+ tipc_nl_link_policy);
+ if (err)
+ return err;
+
+ if (!attrs[TIPC_NLA_LINK_NAME])
+ return -EINVAL;
+
+ link_name = nla_data(attrs[TIPC_NLA_LINK_NAME]);
+
+ if (strcmp(link_name, tipc_bclink_name) == 0) {
+ err = tipc_bclink_reset_stats(net);
+ if (err)
+ return err;
+ return 0;
+ }
+
+ node = tipc_node_find_by_name(net, link_name, &bearer_id);
+ if (!node)
+ return -EINVAL;
+
+ le = &node->links[bearer_id];
+ tipc_node_read_lock(node);
+ spin_lock_bh(&le->lock);
+ link = node->links[bearer_id].link;
+ if (!link) {
+ spin_unlock_bh(&le->lock);
+ tipc_node_read_unlock(node);
+ return -EINVAL;
+ }
+ tipc_link_reset_stats(link);
+ spin_unlock_bh(&le->lock);
+ tipc_node_read_unlock(node);
+ return 0;
+}
+
+/* Caller should hold node lock */
+static int __tipc_nl_add_node_links(struct net *net, struct tipc_nl_msg *msg,
+ struct tipc_node *node, u32 *prev_link)
+{
+ u32 i;
+ int err;
+
+ for (i = *prev_link; i < MAX_BEARERS; i++) {
+ *prev_link = i;
+
+ if (!node->links[i].link)
+ continue;
+
+ err = __tipc_nl_add_link(net, msg,
+ node->links[i].link, NLM_F_MULTI);
+ if (err)
+ return err;
+ }
+ *prev_link = 0;
+
+ return 0;
+}
+
+int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb)
+{
+ struct net *net = sock_net(skb->sk);
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_node *node;
+ struct tipc_nl_msg msg;
+ u32 prev_node = cb->args[0];
+ u32 prev_link = cb->args[1];
+ int done = cb->args[2];
+ int err;
+
+ if (done)
+ return 0;
+
+ msg.skb = skb;
+ msg.portid = NETLINK_CB(cb->skb).portid;
+ msg.seq = cb->nlh->nlmsg_seq;
+
+ rcu_read_lock();
+ if (prev_node) {
+ node = tipc_node_find(net, prev_node);
+ if (!node) {
+ /* We never set seq or call nl_dump_check_consistent()
+ * this means that setting prev_seq here will cause the
+ * consistence check to fail in the netlink callback
+ * handler. Resulting in the last NLMSG_DONE message
+ * having the NLM_F_DUMP_INTR flag set.
+ */
+ cb->prev_seq = 1;
+ goto out;
+ }
+ tipc_node_put(node);
+
+ list_for_each_entry_continue_rcu(node, &tn->node_list,
+ list) {
+ tipc_node_read_lock(node);
+ err = __tipc_nl_add_node_links(net, &msg, node,
+ &prev_link);
+ tipc_node_read_unlock(node);
+ if (err)
+ goto out;
+
+ prev_node = node->addr;
+ }
+ } else {
+ err = tipc_nl_add_bc_link(net, &msg);
+ if (err)
+ goto out;
+
+ list_for_each_entry_rcu(node, &tn->node_list, list) {
+ tipc_node_read_lock(node);
+ err = __tipc_nl_add_node_links(net, &msg, node,
+ &prev_link);
+ tipc_node_read_unlock(node);
+ if (err)
+ goto out;
+
+ prev_node = node->addr;
+ }
+ }
+ done = 1;
+out:
+ rcu_read_unlock();
+
+ cb->args[0] = prev_node;
+ cb->args[1] = prev_link;
+ cb->args[2] = done;
+
+ return skb->len;
+}
diff --git a/net/tipc/node.h b/net/tipc/node.h
index 6734562d3c6e..f39d9d06e8bb 100644
--- a/net/tipc/node.h
+++ b/net/tipc/node.h
@@ -42,23 +42,6 @@
#include "bearer.h"
#include "msg.h"
-/* Out-of-range value for node signature */
-#define INVALID_NODE_SIG 0x10000
-
-#define INVALID_BEARER_ID -1
-
-/* Flags used to take different actions according to flag type
- * TIPC_NOTIFY_NODE_DOWN: notify node is down
- * TIPC_NOTIFY_NODE_UP: notify node is up
- * TIPC_DISTRIBUTE_NAME: publish or withdraw link state name type
- */
-enum {
- TIPC_NOTIFY_NODE_DOWN = (1 << 3),
- TIPC_NOTIFY_NODE_UP = (1 << 4),
- TIPC_NOTIFY_LINK_UP = (1 << 6),
- TIPC_NOTIFY_LINK_DOWN = (1 << 7)
-};
-
/* Optional capabilities supported by this code version
*/
enum {
@@ -66,72 +49,8 @@ enum {
};
#define TIPC_NODE_CAPABILITIES TIPC_BCAST_SYNCH
+#define INVALID_BEARER_ID -1
-struct tipc_link_entry {
- struct tipc_link *link;
- u32 mtu;
- struct sk_buff_head inputq;
- struct tipc_media_addr maddr;
-};
-
-struct tipc_bclink_entry {
- struct tipc_link *link;
- struct sk_buff_head inputq1;
- struct sk_buff_head arrvq;
- struct sk_buff_head inputq2;
- struct sk_buff_head namedq;
-};
-
-/**
- * struct tipc_node - TIPC node structure
- * @addr: network address of node
- * @ref: reference counter to node object
- * @lock: spinlock governing access to structure
- * @net: the applicable net namespace
- * @hash: links to adjacent nodes in unsorted hash chain
- * @inputq: pointer to input queue containing messages for msg event
- * @namedq: pointer to name table input queue with name table messages
- * @active_links: bearer ids of active links, used as index into links[] array
- * @links: array containing references to all links to node
- * @action_flags: bit mask of different types of node actions
- * @state: connectivity state vs peer node
- * @sync_point: sequence number where synch/failover is finished
- * @list: links to adjacent nodes in sorted list of cluster's nodes
- * @working_links: number of working links to node (both active and standby)
- * @link_cnt: number of links to node
- * @capabilities: bitmap, indicating peer node's functional capabilities
- * @signature: node instance identifier
- * @link_id: local and remote bearer ids of changing link, if any
- * @publ_list: list of publications
- * @rcu: rcu struct for tipc_node
- */
-struct tipc_node {
- u32 addr;
- struct kref kref;
- spinlock_t lock;
- struct net *net;
- struct hlist_node hash;
- int active_links[2];
- struct tipc_link_entry links[MAX_BEARERS];
- struct tipc_bclink_entry bc_entry;
- int action_flags;
- struct list_head list;
- int state;
- u16 sync_point;
- int link_cnt;
- u16 working_links;
- u16 capabilities;
- u32 signature;
- u32 link_id;
- struct list_head publ_list;
- struct list_head conn_sks;
- unsigned long keepalive_intv;
- struct timer_list timer;
- struct rcu_head rcu;
-};
-
-struct tipc_node *tipc_node_find(struct net *net, u32 addr);
-void tipc_node_put(struct tipc_node *node);
void tipc_node_stop(struct net *net);
void tipc_node_check_dest(struct net *net, u32 onode,
struct tipc_bearer *bearer,
@@ -139,50 +58,22 @@ void tipc_node_check_dest(struct net *net, u32 onode,
struct tipc_media_addr *maddr,
bool *respond, bool *dupl_addr);
void tipc_node_delete_links(struct net *net, int bearer_id);
-void tipc_node_attach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-void tipc_node_detach_link(struct tipc_node *n_ptr, struct tipc_link *l_ptr);
-bool tipc_node_is_up(struct tipc_node *n);
int tipc_node_get_linkname(struct net *net, u32 bearer_id, u32 node,
char *linkname, size_t len);
-void tipc_node_unlock(struct tipc_node *node);
int tipc_node_xmit(struct net *net, struct sk_buff_head *list, u32 dnode,
int selector);
int tipc_node_xmit_skb(struct net *net, struct sk_buff *skb, u32 dest,
u32 selector);
+void tipc_node_subscribe(struct net *net, struct list_head *subscr, u32 addr);
+void tipc_node_unsubscribe(struct net *net, struct list_head *subscr, u32 addr);
+void tipc_node_broadcast(struct net *net, struct sk_buff *skb);
int tipc_node_add_conn(struct net *net, u32 dnode, u32 port, u32 peer_port);
void tipc_node_remove_conn(struct net *net, u32 dnode, u32 port);
+int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel);
int tipc_nl_node_dump(struct sk_buff *skb, struct netlink_callback *cb);
-
-static inline void tipc_node_lock(struct tipc_node *node)
-{
- spin_lock_bh(&node->lock);
-}
-
-static inline struct tipc_link *node_active_link(struct tipc_node *n, int sel)
-{
- int bearer_id = n->active_links[sel & 1];
-
- if (unlikely(bearer_id == INVALID_BEARER_ID))
- return NULL;
-
- return n->links[bearer_id].link;
-}
-
-static inline unsigned int tipc_node_get_mtu(struct net *net, u32 addr, u32 sel)
-{
- struct tipc_node *n;
- int bearer_id;
- unsigned int mtu = MAX_MSG_SIZE;
-
- n = tipc_node_find(net, addr);
- if (unlikely(!n))
- return mtu;
-
- bearer_id = n->active_links[sel & 1];
- if (likely(bearer_id != INVALID_BEARER_ID))
- mtu = n->links[bearer_id].mtu;
- tipc_node_put(n);
- return mtu;
-}
+int tipc_nl_node_dump_link(struct sk_buff *skb, struct netlink_callback *cb);
+int tipc_nl_node_reset_link_stats(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_node_get_link(struct sk_buff *skb, struct genl_info *info);
+int tipc_nl_node_set_link(struct sk_buff *skb, struct genl_info *info);
#endif
diff --git a/net/tipc/server.c b/net/tipc/server.c
index 922e04a43396..2446bfbaa309 100644
--- a/net/tipc/server.c
+++ b/net/tipc/server.c
@@ -571,13 +571,13 @@ static void tipc_work_stop(struct tipc_server *s)
static int tipc_work_start(struct tipc_server *s)
{
- s->rcv_wq = alloc_workqueue("tipc_rcv", WQ_UNBOUND, 1);
+ s->rcv_wq = alloc_ordered_workqueue("tipc_rcv", 0);
if (!s->rcv_wq) {
pr_err("can't start tipc receive workqueue\n");
return -ENOMEM;
}
- s->send_wq = alloc_workqueue("tipc_send", WQ_UNBOUND, 1);
+ s->send_wq = alloc_ordered_workqueue("tipc_send", 0);
if (!s->send_wq) {
pr_err("can't start tipc send workqueue\n");
destroy_workqueue(s->rcv_wq);
diff --git a/net/tipc/socket.c b/net/tipc/socket.c
index 552dbaba9cf3..3eeb50a27b89 100644
--- a/net/tipc/socket.c
+++ b/net/tipc/socket.c
@@ -42,6 +42,7 @@
#include "name_distr.h"
#include "socket.h"
#include "bcast.h"
+#include "netlink.h"
#define SS_LISTENING -1 /* socket is listening */
#define SS_READY -2 /* socket is connectionless */
@@ -105,6 +106,7 @@ struct tipc_sock {
static int tipc_backlog_rcv(struct sock *sk, struct sk_buff *skb);
static void tipc_data_ready(struct sock *sk);
static void tipc_write_space(struct sock *sk);
+static void tipc_sock_destruct(struct sock *sk);
static int tipc_release(struct socket *sock);
static int tipc_accept(struct socket *sock, struct socket *new_sock, int flags);
static int tipc_wait_for_sndmsg(struct socket *sock, long *timeo_p);
@@ -125,14 +127,6 @@ static const struct proto_ops stream_ops;
static const struct proto_ops msg_ops;
static struct proto tipc_proto;
-static const struct nla_policy tipc_nl_sock_policy[TIPC_NLA_SOCK_MAX + 1] = {
- [TIPC_NLA_SOCK_UNSPEC] = { .type = NLA_UNSPEC },
- [TIPC_NLA_SOCK_ADDR] = { .type = NLA_U32 },
- [TIPC_NLA_SOCK_REF] = { .type = NLA_U32 },
- [TIPC_NLA_SOCK_CON] = { .type = NLA_NESTED },
- [TIPC_NLA_SOCK_HAS_PUBL] = { .type = NLA_FLAG }
-};
-
static const struct rhashtable_params tsk_rht_params;
/*
@@ -381,6 +375,7 @@ static int tipc_sk_create(struct net *net, struct socket *sock,
sk->sk_rcvbuf = sysctl_tipc_rmem[1];
sk->sk_data_ready = tipc_data_ready;
sk->sk_write_space = tipc_write_space;
+ sk->sk_destruct = tipc_sock_destruct;
tsk->conn_timeout = CONN_TIMEOUT_DEFAULT;
tsk->sent_unacked = 0;
atomic_set(&tsk->dupl_rcvcnt, 0);
@@ -470,9 +465,6 @@ static int tipc_release(struct socket *sock)
tipc_node_remove_conn(net, dnode, tsk->portid);
}
- /* Discard any remaining (connection-based) messages in receive queue */
- __skb_queue_purge(&sk->sk_receive_queue);
-
/* Reject any messages that accumulated in backlog queue */
sock->state = SS_DISCONNECTING;
release_sock(sk);
@@ -674,7 +666,7 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
struct tipc_sock *tsk = tipc_sk(sk);
struct net *net = sock_net(sk);
struct tipc_msg *mhdr = &tsk->phdr;
- struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ struct sk_buff_head pktchain;
struct iov_iter save = msg->msg_iter;
uint mtu;
int rc;
@@ -688,14 +680,16 @@ static int tipc_sendmcast(struct socket *sock, struct tipc_name_seq *seq,
msg_set_nameupper(mhdr, seq->upper);
msg_set_hdr_sz(mhdr, MCAST_H_SIZE);
+ skb_queue_head_init(&pktchain);
+
new_mtu:
mtu = tipc_bcast_get_mtu(net);
- rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, pktchain);
+ rc = tipc_msg_build(mhdr, msg, 0, dsz, mtu, &pktchain);
if (unlikely(rc < 0))
return rc;
do {
- rc = tipc_bcast_xmit(net, pktchain);
+ rc = tipc_bcast_xmit(net, &pktchain);
if (likely(!rc))
return dsz;
@@ -705,7 +699,7 @@ new_mtu:
if (!rc)
continue;
}
- __skb_queue_purge(pktchain);
+ __skb_queue_purge(&pktchain);
if (rc == -EMSGSIZE) {
msg->msg_iter = save;
goto new_mtu;
@@ -864,7 +858,7 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
struct net *net = sock_net(sk);
struct tipc_msg *mhdr = &tsk->phdr;
u32 dnode, dport;
- struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ struct sk_buff_head pktchain;
struct sk_buff *skb;
struct tipc_name_seq *seq;
struct iov_iter save;
@@ -925,17 +919,18 @@ static int __tipc_sendmsg(struct socket *sock, struct msghdr *m, size_t dsz)
msg_set_hdr_sz(mhdr, BASIC_H_SIZE);
}
+ skb_queue_head_init(&pktchain);
save = m->msg_iter;
new_mtu:
mtu = tipc_node_get_mtu(net, dnode, tsk->portid);
- rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, pktchain);
+ rc = tipc_msg_build(mhdr, m, 0, dsz, mtu, &pktchain);
if (rc < 0)
return rc;
do {
- skb = skb_peek(pktchain);
+ skb = skb_peek(&pktchain);
TIPC_SKB_CB(skb)->wakeup_pending = tsk->link_cong;
- rc = tipc_node_xmit(net, pktchain, dnode, tsk->portid);
+ rc = tipc_node_xmit(net, &pktchain, dnode, tsk->portid);
if (likely(!rc)) {
if (sock->state != SS_READY)
sock->state = SS_CONNECTING;
@@ -947,7 +942,7 @@ new_mtu:
if (!rc)
continue;
}
- __skb_queue_purge(pktchain);
+ __skb_queue_purge(&pktchain);
if (rc == -EMSGSIZE) {
m->msg_iter = save;
goto new_mtu;
@@ -1017,7 +1012,7 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
struct net *net = sock_net(sk);
struct tipc_sock *tsk = tipc_sk(sk);
struct tipc_msg *mhdr = &tsk->phdr;
- struct sk_buff_head *pktchain = &sk->sk_write_queue;
+ struct sk_buff_head pktchain;
DECLARE_SOCKADDR(struct sockaddr_tipc *, dest, m->msg_name);
u32 portid = tsk->portid;
int rc = -EINVAL;
@@ -1045,17 +1040,19 @@ static int __tipc_send_stream(struct socket *sock, struct msghdr *m, size_t dsz)
timeo = sock_sndtimeo(sk, m->msg_flags & MSG_DONTWAIT);
dnode = tsk_peer_node(tsk);
+ skb_queue_head_init(&pktchain);
next:
save = m->msg_iter;
mtu = tsk->max_pkt;
send = min_t(uint, dsz - sent, TIPC_MAX_USER_MSG_SIZE);
- rc = tipc_msg_build(mhdr, m, sent, send, mtu, pktchain);
+ rc = tipc_msg_build(mhdr, m, sent, send, mtu, &pktchain);
if (unlikely(rc < 0))
return rc;
+
do {
if (likely(!tsk_conn_cong(tsk))) {
- rc = tipc_node_xmit(net, pktchain, dnode, portid);
+ rc = tipc_node_xmit(net, &pktchain, dnode, portid);
if (likely(!rc)) {
tsk->sent_unacked++;
sent += send;
@@ -1064,7 +1061,7 @@ next:
goto next;
}
if (rc == -EMSGSIZE) {
- __skb_queue_purge(pktchain);
+ __skb_queue_purge(&pktchain);
tsk->max_pkt = tipc_node_get_mtu(net, dnode,
portid);
m->msg_iter = save;
@@ -1078,7 +1075,7 @@ next:
rc = tipc_wait_for_sndpkt(sock, &timeo);
} while (!rc);
- __skb_queue_purge(pktchain);
+ __skb_queue_purge(&pktchain);
return sent ? sent : rc;
}
@@ -1492,7 +1489,7 @@ static void tipc_write_space(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLOUT |
POLLWRNORM | POLLWRBAND);
rcu_read_unlock();
@@ -1509,12 +1506,17 @@ static void tipc_data_ready(struct sock *sk)
rcu_read_lock();
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait, POLLIN |
POLLRDNORM | POLLRDBAND);
rcu_read_unlock();
}
+static void tipc_sock_destruct(struct sock *sk)
+{
+ __skb_queue_purge(&sk->sk_receive_queue);
+}
+
/**
* filter_connect - Handle all incoming messages for a connection-based socket
* @tsk: TIPC socket
diff --git a/net/tipc/subscr.c b/net/tipc/subscr.c
index 350cca33ee0a..e6cb386fbf34 100644
--- a/net/tipc/subscr.c
+++ b/net/tipc/subscr.c
@@ -92,25 +92,42 @@ static void tipc_subscrp_send_event(struct tipc_subscription *sub,
*
* Returns 1 if there is overlap, otherwise 0.
*/
-int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower,
+int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
u32 found_upper)
{
- if (found_lower < sub->seq.lower)
- found_lower = sub->seq.lower;
- if (found_upper > sub->seq.upper)
- found_upper = sub->seq.upper;
+ if (found_lower < seq->lower)
+ found_lower = seq->lower;
+ if (found_upper > seq->upper)
+ found_upper = seq->upper;
if (found_lower > found_upper)
return 0;
return 1;
}
+u32 tipc_subscrp_convert_seq_type(u32 type, int swap)
+{
+ return htohl(type, swap);
+}
+
+void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
+ struct tipc_name_seq *out)
+{
+ out->type = htohl(in->type, swap);
+ out->lower = htohl(in->lower, swap);
+ out->upper = htohl(in->upper, swap);
+}
+
void tipc_subscrp_report_overlap(struct tipc_subscription *sub, u32 found_lower,
u32 found_upper, u32 event, u32 port_ref,
u32 node, int must)
{
- if (!tipc_subscrp_check_overlap(sub, found_lower, found_upper))
+ struct tipc_name_seq seq;
+
+ tipc_subscrp_convert_seq(&sub->evt.s.seq, sub->swap, &seq);
+ if (!tipc_subscrp_check_overlap(&seq, found_lower, found_upper))
return;
- if (!must && !(sub->filter & TIPC_SUB_PORTS))
+ if (!must &&
+ !(htohl(sub->evt.s.filter, sub->swap) & TIPC_SUB_PORTS))
return;
tipc_subscrp_send_event(sub, found_lower, found_upper, event, port_ref,
@@ -171,12 +188,14 @@ static struct tipc_subscriber *tipc_subscrb_create(int conid)
static void tipc_subscrb_delete(struct tipc_subscriber *subscriber)
{
struct tipc_subscription *sub, *temp;
+ u32 timeout;
spin_lock_bh(&subscriber->lock);
/* Destroy any existing subscriptions for subscriber */
list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
subscrp_list) {
- if (del_timer(&sub->timer)) {
+ timeout = htohl(sub->evt.s.timeout, sub->swap);
+ if ((timeout == TIPC_WAIT_FOREVER) || del_timer(&sub->timer)) {
tipc_subscrp_delete(sub);
tipc_subscrb_put(subscriber);
}
@@ -200,13 +219,16 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s,
struct tipc_subscriber *subscriber)
{
struct tipc_subscription *sub, *temp;
+ u32 timeout;
spin_lock_bh(&subscriber->lock);
/* Find first matching subscription, exit if not found */
list_for_each_entry_safe(sub, temp, &subscriber->subscrp_list,
subscrp_list) {
if (!memcmp(s, &sub->evt.s, sizeof(struct tipc_subscr))) {
- if (del_timer(&sub->timer)) {
+ timeout = htohl(sub->evt.s.timeout, sub->swap);
+ if ((timeout == TIPC_WAIT_FOREVER) ||
+ del_timer(&sub->timer)) {
tipc_subscrp_delete(sub);
tipc_subscrb_put(subscriber);
}
@@ -216,66 +238,67 @@ static void tipc_subscrp_cancel(struct tipc_subscr *s,
spin_unlock_bh(&subscriber->lock);
}
-static int tipc_subscrp_create(struct net *net, struct tipc_subscr *s,
- struct tipc_subscriber *subscriber,
- struct tipc_subscription **sub_p)
+static struct tipc_subscription *tipc_subscrp_create(struct net *net,
+ struct tipc_subscr *s,
+ int swap)
{
struct tipc_net *tn = net_generic(net, tipc_net_id);
struct tipc_subscription *sub;
- int swap;
-
- /* Determine subscriber's endianness */
- swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE));
-
- /* Detect & process a subscription cancellation request */
- if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
- s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
- tipc_subscrp_cancel(s, subscriber);
- return 0;
- }
+ u32 filter = htohl(s->filter, swap);
/* Refuse subscription if global limit exceeded */
if (atomic_read(&tn->subscription_count) >= TIPC_MAX_SUBSCRIPTIONS) {
pr_warn("Subscription rejected, limit reached (%u)\n",
TIPC_MAX_SUBSCRIPTIONS);
- return -EINVAL;
+ return NULL;
}
/* Allocate subscription object */
sub = kmalloc(sizeof(*sub), GFP_ATOMIC);
if (!sub) {
pr_warn("Subscription rejected, no memory\n");
- return -ENOMEM;
+ return NULL;
}
/* Initialize subscription object */
sub->net = net;
- sub->seq.type = htohl(s->seq.type, swap);
- sub->seq.lower = htohl(s->seq.lower, swap);
- sub->seq.upper = htohl(s->seq.upper, swap);
- sub->timeout = msecs_to_jiffies(htohl(s->timeout, swap));
- sub->filter = htohl(s->filter, swap);
- if ((!(sub->filter & TIPC_SUB_PORTS) ==
- !(sub->filter & TIPC_SUB_SERVICE)) ||
- (sub->seq.lower > sub->seq.upper)) {
+ if (((filter & TIPC_SUB_PORTS) && (filter & TIPC_SUB_SERVICE)) ||
+ (htohl(s->seq.lower, swap) > htohl(s->seq.upper, swap))) {
pr_warn("Subscription rejected, illegal request\n");
kfree(sub);
- return -EINVAL;
+ return NULL;
}
- spin_lock_bh(&subscriber->lock);
- list_add(&sub->subscrp_list, &subscriber->subscrp_list);
- spin_unlock_bh(&subscriber->lock);
- sub->subscriber = subscriber;
+
sub->swap = swap;
memcpy(&sub->evt.s, s, sizeof(*s));
atomic_inc(&tn->subscription_count);
+ return sub;
+}
+
+static void tipc_subscrp_subscribe(struct net *net, struct tipc_subscr *s,
+ struct tipc_subscriber *subscriber, int swap)
+{
+ struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_subscription *sub = NULL;
+ u32 timeout;
+
+ sub = tipc_subscrp_create(net, s, swap);
+ if (!sub)
+ return tipc_conn_terminate(tn->topsrv, subscriber->conid);
+
+ spin_lock_bh(&subscriber->lock);
+ list_add(&sub->subscrp_list, &subscriber->subscrp_list);
+ tipc_subscrb_get(subscriber);
+ sub->subscriber = subscriber;
+ tipc_nametbl_subscribe(sub);
+ spin_unlock_bh(&subscriber->lock);
+
+ timeout = htohl(sub->evt.s.timeout, swap);
+ if (timeout == TIPC_WAIT_FOREVER)
+ return;
+
setup_timer(&sub->timer, tipc_subscrp_timeout, (unsigned long)sub);
- if (sub->timeout != TIPC_WAIT_FOREVER)
- sub->timeout += jiffies;
- if (!mod_timer(&sub->timer, sub->timeout))
- tipc_subscrb_get(subscriber);
- *sub_p = sub;
- return 0;
+ mod_timer(&sub->timer, jiffies + msecs_to_jiffies(timeout));
}
/* Handle one termination request for the subscriber */
@@ -290,14 +313,21 @@ static void tipc_subscrb_rcv_cb(struct net *net, int conid,
void *buf, size_t len)
{
struct tipc_subscriber *subscriber = usr_data;
- struct tipc_subscription *sub = NULL;
- struct tipc_net *tn = net_generic(net, tipc_net_id);
+ struct tipc_subscr *s = (struct tipc_subscr *)buf;
+ int swap;
+
+ /* Determine subscriber's endianness */
+ swap = !(s->filter & (TIPC_SUB_PORTS | TIPC_SUB_SERVICE |
+ TIPC_SUB_CANCEL));
+
+ /* Detect & process a subscription cancellation request */
+ if (s->filter & htohl(TIPC_SUB_CANCEL, swap)) {
+ s->filter &= ~htohl(TIPC_SUB_CANCEL, swap);
+ return tipc_subscrp_cancel(s, subscriber);
+ }
- tipc_subscrp_create(net, (struct tipc_subscr *)buf, subscriber, &sub);
- if (sub)
- tipc_nametbl_subscribe(sub);
- else
- tipc_conn_terminate(tn->topsrv, subscriber->conid);
+ if (s)
+ tipc_subscrp_subscribe(net, s, subscriber, swap);
}
/* Handle one request to establish a new subscriber */
diff --git a/net/tipc/subscr.h b/net/tipc/subscr.h
index 92ee18cc5fe6..be60103082c9 100644
--- a/net/tipc/subscr.h
+++ b/net/tipc/subscr.h
@@ -50,21 +50,15 @@ struct tipc_subscriber;
* @subscriber: pointer to its subscriber
* @seq: name sequence associated with subscription
* @net: point to network namespace
- * @timeout: duration of subscription (in ms)
- * @filter: event filtering to be done for subscription
* @timer: timer governing subscription duration (optional)
* @nameseq_list: adjacent subscriptions in name sequence's subscription list
* @subscrp_list: adjacent subscriptions in subscriber's subscription list
- * @server_ref: object reference of server port associated with subscription
* @swap: indicates if subscriber uses opposite endianness in its messages
* @evt: template for events generated by subscription
*/
struct tipc_subscription {
struct tipc_subscriber *subscriber;
- struct tipc_name_seq seq;
struct net *net;
- unsigned long timeout;
- u32 filter;
struct timer_list timer;
struct list_head nameseq_list;
struct list_head subscrp_list;
@@ -72,11 +66,14 @@ struct tipc_subscription {
struct tipc_event evt;
};
-int tipc_subscrp_check_overlap(struct tipc_subscription *sub, u32 found_lower,
+int tipc_subscrp_check_overlap(struct tipc_name_seq *seq, u32 found_lower,
u32 found_upper);
void tipc_subscrp_report_overlap(struct tipc_subscription *sub,
u32 found_lower, u32 found_upper, u32 event,
u32 port_ref, u32 node, int must);
+void tipc_subscrp_convert_seq(struct tipc_name_seq *in, int swap,
+ struct tipc_name_seq *out);
+u32 tipc_subscrp_convert_seq_type(u32 type, int swap);
int tipc_topsrv_start(struct net *net);
void tipc_topsrv_stop(struct net *net);
diff --git a/net/tipc/udp_media.c b/net/tipc/udp_media.c
index ad2719ad4c1b..c9cf2be3674a 100644
--- a/net/tipc/udp_media.c
+++ b/net/tipc/udp_media.c
@@ -48,20 +48,12 @@
#include <linux/tipc_netlink.h>
#include "core.h"
#include "bearer.h"
-#include "msg.h"
+#include "netlink.h"
/* IANA assigned UDP port */
#define UDP_PORT_DEFAULT 6118
-#define UDP_MIN_HEADROOM 28
-
-static const struct nla_policy tipc_nl_udp_policy[TIPC_NLA_UDP_MAX + 1] = {
- [TIPC_NLA_UDP_UNSPEC] = {.type = NLA_UNSPEC},
- [TIPC_NLA_UDP_LOCAL] = {.type = NLA_BINARY,
- .len = sizeof(struct sockaddr_storage)},
- [TIPC_NLA_UDP_REMOTE] = {.type = NLA_BINARY,
- .len = sizeof(struct sockaddr_storage)},
-};
+#define UDP_MIN_HEADROOM 48
/**
* struct udp_media_addr - IP/UDP addressing information
@@ -158,8 +150,11 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
struct udp_media_addr *src = (struct udp_media_addr *)&b->addr.value;
struct rtable *rt;
- if (skb_headroom(skb) < UDP_MIN_HEADROOM)
- pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
+ if (skb_headroom(skb) < UDP_MIN_HEADROOM) {
+ err = pskb_expand_head(skb, UDP_MIN_HEADROOM, 0, GFP_ATOMIC);
+ if (err)
+ goto tx_error;
+ }
skb_set_inner_protocol(skb, htons(ETH_P_TIPC));
ub = rcu_dereference_rtnl(b->media_ptr);
@@ -179,16 +174,12 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
err = PTR_ERR(rt);
goto tx_error;
}
+
+ skb->dev = rt->dst.dev;
ttl = ip4_dst_hoplimit(&rt->dst);
- err = udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb,
- src->ipv4.s_addr,
- dst->ipv4.s_addr, 0, ttl, 0,
- src->udp_port, dst->udp_port,
- false, true);
- if (err < 0) {
- ip_rt_put(rt);
- goto tx_error;
- }
+ udp_tunnel_xmit_skb(rt, ub->ubsock->sk, skb, src->ipv4.s_addr,
+ dst->ipv4.s_addr, 0, ttl, 0, src->udp_port,
+ dst->udp_port, false, true);
#if IS_ENABLED(CONFIG_IPV6)
} else {
struct dst_entry *ndst;
@@ -205,7 +196,7 @@ static int tipc_udp_send_msg(struct net *net, struct sk_buff *skb,
ttl = ip6_dst_hoplimit(ndst);
err = udp_tunnel6_xmit_skb(ndst, ub->ubsock->sk, skb,
ndst->dev, &src->ipv6,
- &dst->ipv6, 0, ttl, src->udp_port,
+ &dst->ipv6, 0, ttl, 0, src->udp_port,
dst->udp_port, false);
#endif
}
@@ -221,10 +212,6 @@ static int tipc_udp_recv(struct sock *sk, struct sk_buff *skb)
{
struct udp_bearer *ub;
struct tipc_bearer *b;
- int usr = msg_user(buf_msg(skb));
-
- if ((usr == LINK_PROTOCOL) || (usr == NAME_DISTRIBUTOR))
- skb_linearize(skb);
ub = rcu_dereference_sk_user_data(sk);
if (!ub) {
@@ -282,7 +269,7 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
struct udp_media_addr *remote)
{
struct nlattr *opts[TIPC_NLA_UDP_MAX + 1];
- struct sockaddr_storage *sa_local, *sa_remote;
+ struct sockaddr_storage sa_local, sa_remote;
if (!attrs[TIPC_NLA_BEARER_UDP_OPTS])
goto err;
@@ -291,41 +278,48 @@ static int parse_options(struct nlattr *attrs[], struct udp_bearer *ub,
tipc_nl_udp_policy))
goto err;
if (opts[TIPC_NLA_UDP_LOCAL] && opts[TIPC_NLA_UDP_REMOTE]) {
- sa_local = nla_data(opts[TIPC_NLA_UDP_LOCAL]);
- sa_remote = nla_data(opts[TIPC_NLA_UDP_REMOTE]);
+ nla_memcpy(&sa_local, opts[TIPC_NLA_UDP_LOCAL],
+ sizeof(sa_local));
+ nla_memcpy(&sa_remote, opts[TIPC_NLA_UDP_REMOTE],
+ sizeof(sa_remote));
} else {
err:
pr_err("Invalid UDP bearer configuration");
return -EINVAL;
}
- if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET) {
+ if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET) {
struct sockaddr_in *ip4;
- ip4 = (struct sockaddr_in *)sa_local;
+ ip4 = (struct sockaddr_in *)&sa_local;
local->proto = htons(ETH_P_IP);
local->udp_port = ip4->sin_port;
local->ipv4.s_addr = ip4->sin_addr.s_addr;
- ip4 = (struct sockaddr_in *)sa_remote;
+ ip4 = (struct sockaddr_in *)&sa_remote;
remote->proto = htons(ETH_P_IP);
remote->udp_port = ip4->sin_port;
remote->ipv4.s_addr = ip4->sin_addr.s_addr;
return 0;
#if IS_ENABLED(CONFIG_IPV6)
- } else if ((sa_local->ss_family & sa_remote->ss_family) == AF_INET6) {
+ } else if ((sa_local.ss_family & sa_remote.ss_family) == AF_INET6) {
+ int atype;
struct sockaddr_in6 *ip6;
- ip6 = (struct sockaddr_in6 *)sa_local;
+ ip6 = (struct sockaddr_in6 *)&sa_local;
+ atype = ipv6_addr_type(&ip6->sin6_addr);
+ if (__ipv6_addr_needs_scope_id(atype) && !ip6->sin6_scope_id)
+ return -EINVAL;
+
local->proto = htons(ETH_P_IPV6);
local->udp_port = ip6->sin6_port;
- local->ipv6 = ip6->sin6_addr;
+ memcpy(&local->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
ub->ifindex = ip6->sin6_scope_id;
- ip6 = (struct sockaddr_in6 *)sa_remote;
+ ip6 = (struct sockaddr_in6 *)&sa_remote;
remote->proto = htons(ETH_P_IPV6);
remote->udp_port = ip6->sin6_port;
- remote->ipv6 = ip6->sin6_addr;
+ memcpy(&remote->ipv6, &ip6->sin6_addr, sizeof(struct in6_addr));
return 0;
#endif
}
diff --git a/net/unix/af_unix.c b/net/unix/af_unix.c
index aaa0b58d6aba..8269da73e9e5 100644
--- a/net/unix/af_unix.c
+++ b/net/unix/af_unix.c
@@ -326,6 +326,118 @@ found:
return s;
}
+/* Support code for asymmetrically connected dgram sockets
+ *
+ * If a datagram socket is connected to a socket not itself connected
+ * to the first socket (eg, /dev/log), clients may only enqueue more
+ * messages if the present receive queue of the server socket is not
+ * "too large". This means there's a second writeability condition
+ * poll and sendmsg need to test. The dgram recv code will do a wake
+ * up on the peer_wait wait queue of a socket upon reception of a
+ * datagram which needs to be propagated to sleeping would-be writers
+ * since these might not have sent anything so far. This can't be
+ * accomplished via poll_wait because the lifetime of the server
+ * socket might be less than that of its clients if these break their
+ * association with it or if the server socket is closed while clients
+ * are still connected to it and there's no way to inform "a polling
+ * implementation" that it should let go of a certain wait queue
+ *
+ * In order to propagate a wake up, a wait_queue_t of the client
+ * socket is enqueued on the peer_wait queue of the server socket
+ * whose wake function does a wake_up on the ordinary client socket
+ * wait queue. This connection is established whenever a write (or
+ * poll for write) hit the flow control condition and broken when the
+ * association to the server socket is dissolved or after a wake up
+ * was relayed.
+ */
+
+static int unix_dgram_peer_wake_relay(wait_queue_t *q, unsigned mode, int flags,
+ void *key)
+{
+ struct unix_sock *u;
+ wait_queue_head_t *u_sleep;
+
+ u = container_of(q, struct unix_sock, peer_wake);
+
+ __remove_wait_queue(&unix_sk(u->peer_wake.private)->peer_wait,
+ q);
+ u->peer_wake.private = NULL;
+
+ /* relaying can only happen while the wq still exists */
+ u_sleep = sk_sleep(&u->sk);
+ if (u_sleep)
+ wake_up_interruptible_poll(u_sleep, key);
+
+ return 0;
+}
+
+static int unix_dgram_peer_wake_connect(struct sock *sk, struct sock *other)
+{
+ struct unix_sock *u, *u_other;
+ int rc;
+
+ u = unix_sk(sk);
+ u_other = unix_sk(other);
+ rc = 0;
+ spin_lock(&u_other->peer_wait.lock);
+
+ if (!u->peer_wake.private) {
+ u->peer_wake.private = other;
+ __add_wait_queue(&u_other->peer_wait, &u->peer_wake);
+
+ rc = 1;
+ }
+
+ spin_unlock(&u_other->peer_wait.lock);
+ return rc;
+}
+
+static void unix_dgram_peer_wake_disconnect(struct sock *sk,
+ struct sock *other)
+{
+ struct unix_sock *u, *u_other;
+
+ u = unix_sk(sk);
+ u_other = unix_sk(other);
+ spin_lock(&u_other->peer_wait.lock);
+
+ if (u->peer_wake.private == other) {
+ __remove_wait_queue(&u_other->peer_wait, &u->peer_wake);
+ u->peer_wake.private = NULL;
+ }
+
+ spin_unlock(&u_other->peer_wait.lock);
+}
+
+static void unix_dgram_peer_wake_disconnect_wakeup(struct sock *sk,
+ struct sock *other)
+{
+ unix_dgram_peer_wake_disconnect(sk, other);
+ wake_up_interruptible_poll(sk_sleep(sk),
+ POLLOUT |
+ POLLWRNORM |
+ POLLWRBAND);
+}
+
+/* preconditions:
+ * - unix_peer(sk) == other
+ * - association is stable
+ */
+static int unix_dgram_peer_wake_me(struct sock *sk, struct sock *other)
+{
+ int connected;
+
+ connected = unix_dgram_peer_wake_connect(sk, other);
+
+ if (unix_recvq_full(other))
+ return 1;
+
+ if (connected)
+ unix_dgram_peer_wake_disconnect(sk, other);
+
+ return 0;
+}
+
static int unix_writable(const struct sock *sk)
{
return sk->sk_state != TCP_LISTEN &&
@@ -339,7 +451,7 @@ static void unix_write_space(struct sock *sk)
rcu_read_lock();
if (unix_writable(sk)) {
wq = rcu_dereference(sk->sk_wq);
- if (wq_has_sleeper(wq))
+ if (skwq_has_sleeper(wq))
wake_up_interruptible_sync_poll(&wq->wait,
POLLOUT | POLLWRNORM | POLLWRBAND);
sk_wake_async(sk, SOCK_WAKE_SPACE, POLL_OUT);
@@ -431,6 +543,8 @@ static void unix_release_sock(struct sock *sk, int embrion)
skpair->sk_state_change(skpair);
sk_wake_async(skpair, SOCK_WAKE_WAITD, POLL_HUP);
}
+
+ unix_dgram_peer_wake_disconnect(sk, skpair);
sock_put(skpair); /* It may now die */
unix_peer(sk) = NULL;
}
@@ -441,6 +555,7 @@ static void unix_release_sock(struct sock *sk, int embrion)
if (state == TCP_LISTEN)
unix_release_sock(skb->sk, 1);
/* passed fds are erased in the kfree_skb hook */
+ UNIXCB(skb).consumed = skb->len;
kfree_skb(skb);
}
@@ -665,6 +780,7 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern)
INIT_LIST_HEAD(&u->link);
mutex_init(&u->readlock); /* single task reading lock */
init_waitqueue_head(&u->peer_wait);
+ init_waitqueue_func_entry(&u->peer_wake, unix_dgram_peer_wake_relay);
unix_insert_socket(unix_sockets_unbound(sk), sk);
out:
if (sk == NULL)
@@ -837,32 +953,20 @@ fail:
return NULL;
}
-static int unix_mknod(const char *sun_path, umode_t mode, struct path *res)
+static int unix_mknod(struct dentry *dentry, struct path *path, umode_t mode,
+ struct path *res)
{
- struct dentry *dentry;
- struct path path;
- int err = 0;
- /*
- * Get the parent directory, calculate the hash for last
- * component.
- */
- dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
- err = PTR_ERR(dentry);
- if (IS_ERR(dentry))
- return err;
+ int err;
- /*
- * All right, let's create it.
- */
- err = security_path_mknod(&path, dentry, mode, 0);
+ err = security_path_mknod(path, dentry, mode, 0);
if (!err) {
- err = vfs_mknod(d_inode(path.dentry), dentry, mode, 0);
+ err = vfs_mknod(d_inode(path->dentry), dentry, mode, 0);
if (!err) {
- res->mnt = mntget(path.mnt);
+ res->mnt = mntget(path->mnt);
res->dentry = dget(dentry);
}
}
- done_path_create(&path, dentry);
+
return err;
}
@@ -873,10 +977,12 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
struct unix_sock *u = unix_sk(sk);
struct sockaddr_un *sunaddr = (struct sockaddr_un *)uaddr;
char *sun_path = sunaddr->sun_path;
- int err;
+ int err, name_err;
unsigned int hash;
struct unix_address *addr;
struct hlist_head *list;
+ struct path path;
+ struct dentry *dentry;
err = -EINVAL;
if (sunaddr->sun_family != AF_UNIX)
@@ -892,14 +998,34 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out;
addr_len = err;
+ name_err = 0;
+ dentry = NULL;
+ if (sun_path[0]) {
+ /* Get the parent directory, calculate the hash for last
+ * component.
+ */
+ dentry = kern_path_create(AT_FDCWD, sun_path, &path, 0);
+
+ if (IS_ERR(dentry)) {
+ /* delay report until after 'already bound' check */
+ name_err = PTR_ERR(dentry);
+ dentry = NULL;
+ }
+ }
+
err = mutex_lock_interruptible(&u->readlock);
if (err)
- goto out;
+ goto out_path;
err = -EINVAL;
if (u->addr)
goto out_up;
+ if (name_err) {
+ err = name_err == -EEXIST ? -EADDRINUSE : name_err;
+ goto out_up;
+ }
+
err = -ENOMEM;
addr = kmalloc(sizeof(*addr)+addr_len, GFP_KERNEL);
if (!addr)
@@ -910,11 +1036,11 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
addr->hash = hash ^ sk->sk_type;
atomic_set(&addr->refcnt, 1);
- if (sun_path[0]) {
- struct path path;
+ if (dentry) {
+ struct path u_path;
umode_t mode = S_IFSOCK |
(SOCK_INODE(sock)->i_mode & ~current_umask());
- err = unix_mknod(sun_path, mode, &path);
+ err = unix_mknod(dentry, &path, mode, &u_path);
if (err) {
if (err == -EEXIST)
err = -EADDRINUSE;
@@ -922,9 +1048,9 @@ static int unix_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
goto out_up;
}
addr->hash = UNIX_HASH_SIZE;
- hash = d_backing_inode(path.dentry)->i_ino & (UNIX_HASH_SIZE-1);
+ hash = d_backing_inode(dentry)->i_ino & (UNIX_HASH_SIZE - 1);
spin_lock(&unix_table_lock);
- u->path = path;
+ u->path = u_path;
list = &unix_socket_table[hash];
} else {
spin_lock(&unix_table_lock);
@@ -947,6 +1073,10 @@ out_unlock:
spin_unlock(&unix_table_lock);
out_up:
mutex_unlock(&u->readlock);
+out_path:
+ if (dentry)
+ done_path_create(&path, dentry);
+
out:
return err;
}
@@ -1032,6 +1162,8 @@ restart:
if (unix_peer(sk)) {
struct sock *old_peer = unix_peer(sk);
unix_peer(sk) = other;
+ unix_dgram_peer_wake_disconnect_wakeup(sk, old_peer);
+
unix_state_double_unlock(sk, other);
if (other != old_peer)
@@ -1364,7 +1496,7 @@ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
UNIXCB(skb).fp = NULL;
for (i = scm->fp->count-1; i >= 0; i--)
- unix_notinflight(scm->fp->fp[i]);
+ unix_notinflight(scm->fp->user, scm->fp->fp[i]);
}
static void unix_destruct_scm(struct sk_buff *skb)
@@ -1381,22 +1513,37 @@ static void unix_destruct_scm(struct sk_buff *skb)
sock_wfree(skb);
}
+/*
+ * The "user->unix_inflight" variable is protected by the garbage
+ * collection lock, and we just read it locklessly here. If you go
+ * over the limit, there might be a tiny race in actually noticing
+ * it across threads. Tough.
+ */
+static inline bool too_many_unix_fds(struct task_struct *p)
+{
+ struct user_struct *user = current_user();
+
+ if (unlikely(user->unix_inflight > task_rlimit(p, RLIMIT_NOFILE)))
+ return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+ return false;
+}
+
#define MAX_RECURSION_LEVEL 4
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
int i;
unsigned char max_level = 0;
- int unix_sock_count = 0;
+
+ if (too_many_unix_fds(current))
+ return -ETOOMANYREFS;
for (i = scm->fp->count - 1; i >= 0; i--) {
struct sock *sk = unix_get_socket(scm->fp->fp[i]);
- if (sk) {
- unix_sock_count++;
+ if (sk)
max_level = max(max_level,
unix_sk(sk)->recursion_level);
- }
}
if (unlikely(max_level > MAX_RECURSION_LEVEL))
return -ETOOMANYREFS;
@@ -1410,10 +1557,8 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
if (!UNIXCB(skb).fp)
return -ENOMEM;
- if (unix_sock_count) {
- for (i = scm->fp->count - 1; i >= 0; i--)
- unix_inflight(scm->fp->fp[i]);
- }
+ for (i = scm->fp->count - 1; i >= 0; i--)
+ unix_inflight(scm->fp->user, scm->fp->fp[i]);
return max_level;
}
@@ -1433,6 +1578,14 @@ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool sen
return err;
}
+static bool unix_passcred_enabled(const struct socket *sock,
+ const struct sock *other)
+{
+ return test_bit(SOCK_PASSCRED, &sock->flags) ||
+ !other->sk_socket ||
+ test_bit(SOCK_PASSCRED, &other->sk_socket->flags);
+}
+
/*
* Some apps rely on write() giving SCM_CREDENTIALS
* We include credentials if source or destination socket
@@ -1443,14 +1596,41 @@ static void maybe_add_creds(struct sk_buff *skb, const struct socket *sock,
{
if (UNIXCB(skb).pid)
return;
- if (test_bit(SOCK_PASSCRED, &sock->flags) ||
- !other->sk_socket ||
- test_bit(SOCK_PASSCRED, &other->sk_socket->flags)) {
+ if (unix_passcred_enabled(sock, other)) {
UNIXCB(skb).pid = get_pid(task_tgid(current));
current_uid_gid(&UNIXCB(skb).uid, &UNIXCB(skb).gid);
}
}
+static int maybe_init_creds(struct scm_cookie *scm,
+ struct socket *socket,
+ const struct sock *other)
+{
+ int err;
+ struct msghdr msg = { .msg_controllen = 0 };
+
+ err = scm_send(socket, &msg, scm, false);
+ if (err)
+ return err;
+
+ if (unix_passcred_enabled(socket, other)) {
+ scm->pid = get_pid(task_tgid(current));
+ current_uid_gid(&scm->creds.uid, &scm->creds.gid);
+ }
+ return err;
+}
+
+static bool unix_skb_scm_eq(struct sk_buff *skb,
+ struct scm_cookie *scm)
+{
+ const struct unix_skb_parms *u = &UNIXCB(skb);
+
+ return u->pid == scm->pid &&
+ uid_eq(u->uid, scm->creds.uid) &&
+ gid_eq(u->gid, scm->creds.gid) &&
+ unix_secdata_eq(scm, skb);
+}
+
/*
* Send AF_UNIX data.
*/
@@ -1471,6 +1651,7 @@ static int unix_dgram_sendmsg(struct socket *sock, struct msghdr *msg,
struct scm_cookie scm;
int max_level;
int data_len = 0;
+ int sk_locked;
wait_for_unix_gc();
err = scm_send(sock, msg, &scm, false);
@@ -1549,12 +1730,14 @@ restart:
goto out_free;
}
+ sk_locked = 0;
unix_state_lock(other);
+restart_locked:
err = -EPERM;
if (!unix_may_send(sk, other))
goto out_unlock;
- if (sock_flag(other, SOCK_DEAD)) {
+ if (unlikely(sock_flag(other, SOCK_DEAD))) {
/*
* Check with 1003.1g - what should
* datagram error
@@ -1562,10 +1745,14 @@ restart:
unix_state_unlock(other);
sock_put(other);
+ if (!sk_locked)
+ unix_state_lock(sk);
+
err = 0;
- unix_state_lock(sk);
if (unix_peer(sk) == other) {
unix_peer(sk) = NULL;
+ unix_dgram_peer_wake_disconnect_wakeup(sk, other);
+
unix_state_unlock(sk);
unix_dgram_disconnected(sk, other);
@@ -1591,21 +1778,43 @@ restart:
goto out_unlock;
}
- if (unix_peer(other) != sk && unix_recvq_full(other)) {
- if (!timeo) {
- err = -EAGAIN;
- goto out_unlock;
+ /* other == sk && unix_peer(other) != sk if
+ * - unix_peer(sk) == NULL, destination address bound to sk
+ * - unix_peer(sk) == sk by time of get but disconnected before lock
+ */
+ if (other != sk &&
+ unlikely(unix_peer(other) != sk && unix_recvq_full(other))) {
+ if (timeo) {
+ timeo = unix_wait_for_peer(other, timeo);
+
+ err = sock_intr_errno(timeo);
+ if (signal_pending(current))
+ goto out_free;
+
+ goto restart;
}
- timeo = unix_wait_for_peer(other, timeo);
+ if (!sk_locked) {
+ unix_state_unlock(other);
+ unix_state_double_lock(sk, other);
+ }
- err = sock_intr_errno(timeo);
- if (signal_pending(current))
- goto out_free;
+ if (unix_peer(sk) != other ||
+ unix_dgram_peer_wake_me(sk, other)) {
+ err = -EAGAIN;
+ sk_locked = 1;
+ goto out_unlock;
+ }
- goto restart;
+ if (!sk_locked) {
+ sk_locked = 1;
+ goto restart_locked;
+ }
}
+ if (unlikely(sk_locked))
+ unix_state_unlock(sk);
+
if (sock_flag(other, SOCK_RCVTSTAMP))
__net_timestamp(skb);
maybe_add_creds(skb, sock, other);
@@ -1619,6 +1828,8 @@ restart:
return len;
out_unlock:
+ if (sk_locked)
+ unix_state_unlock(sk);
unix_state_unlock(other);
out_free:
kfree_skb(skb);
@@ -1740,8 +1951,10 @@ out_err:
static ssize_t unix_stream_sendpage(struct socket *socket, struct page *page,
int offset, size_t size, int flags)
{
- int err = 0;
- bool send_sigpipe = true;
+ int err;
+ bool send_sigpipe = false;
+ bool init_scm = true;
+ struct scm_cookie scm;
struct sock *other, *sk = socket->sk;
struct sk_buff *skb, *newskb = NULL, *tail = NULL;
@@ -1759,7 +1972,7 @@ alloc_skb:
newskb = sock_alloc_send_pskb(sk, 0, 0, flags & MSG_DONTWAIT,
&err, 0);
if (!newskb)
- return err;
+ goto err;
}
/* we must acquire readlock as we modify already present
@@ -1768,12 +1981,12 @@ alloc_skb:
err = mutex_lock_interruptible(&unix_sk(other)->readlock);
if (err) {
err = flags & MSG_DONTWAIT ? -EAGAIN : -ERESTARTSYS;
- send_sigpipe = false;
goto err;
}
if (sk->sk_shutdown & SEND_SHUTDOWN) {
err = -EPIPE;
+ send_sigpipe = true;
goto err_unlock;
}
@@ -1782,23 +1995,34 @@ alloc_skb:
if (sock_flag(other, SOCK_DEAD) ||
other->sk_shutdown & RCV_SHUTDOWN) {
err = -EPIPE;
+ send_sigpipe = true;
goto err_state_unlock;
}
+ if (init_scm) {
+ err = maybe_init_creds(&scm, socket, other);
+ if (err)
+ goto err_state_unlock;
+ init_scm = false;
+ }
+
skb = skb_peek_tail(&other->sk_receive_queue);
if (tail && tail == skb) {
skb = newskb;
- } else if (!skb) {
- if (newskb)
+ } else if (!skb || !unix_skb_scm_eq(skb, &scm)) {
+ if (newskb) {
skb = newskb;
- else
+ } else {
+ tail = skb;
goto alloc_skb;
+ }
} else if (newskb) {
/* this is fast path, we don't necessarily need to
* call to kfree_skb even though with newskb == NULL
* this - does no harm
*/
consume_skb(newskb);
+ newskb = NULL;
}
if (skb_append_pagefrags(skb, page, offset, size)) {
@@ -1811,14 +2035,20 @@ alloc_skb:
skb->truesize += size;
atomic_add(size, &sk->sk_wmem_alloc);
- if (newskb)
+ if (newskb) {
+ err = unix_scm_to_skb(&scm, skb, false);
+ if (err)
+ goto err_state_unlock;
+ spin_lock(&other->sk_receive_queue.lock);
__skb_queue_tail(&other->sk_receive_queue, newskb);
+ spin_unlock(&other->sk_receive_queue.lock);
+ }
unix_state_unlock(other);
mutex_unlock(&unix_sk(other)->readlock);
other->sk_data_ready(other);
-
+ scm_destroy(&scm);
return size;
err_state_unlock:
@@ -1829,6 +2059,8 @@ err:
kfree_skb(newskb);
if (send_sigpipe && !(flags & MSG_NOSIGNAL))
send_sig(SIGPIPE, current, 0);
+ if (!init_scm)
+ scm_destroy(&scm);
return err;
}
@@ -1878,8 +2110,8 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
struct scm_cookie scm;
struct sock *sk = sock->sk;
struct unix_sock *u = unix_sk(sk);
- int noblock = flags & MSG_DONTWAIT;
- struct sk_buff *skb;
+ struct sk_buff *skb, *last;
+ long timeo;
int err;
int peeked, skip;
@@ -1887,30 +2119,38 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
if (flags&MSG_OOB)
goto out;
- err = mutex_lock_interruptible(&u->readlock);
- if (unlikely(err)) {
- /* recvmsg() in non blocking mode is supposed to return -EAGAIN
- * sk_rcvtimeo is not honored by mutex_lock_interruptible()
- */
- err = noblock ? -EAGAIN : -ERESTARTSYS;
- goto out;
- }
+ timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);
- skip = sk_peek_offset(sk, flags);
+ do {
+ mutex_lock(&u->readlock);
- skb = __skb_recv_datagram(sk, flags, &peeked, &skip, &err);
- if (!skb) {
+ skip = sk_peek_offset(sk, flags);
+ skb = __skb_try_recv_datagram(sk, flags, &peeked, &skip, &err,
+ &last);
+ if (skb)
+ break;
+
+ mutex_unlock(&u->readlock);
+
+ if (err != -EAGAIN)
+ break;
+ } while (timeo &&
+ !__skb_wait_for_more_packets(sk, &err, &timeo, last));
+
+ if (!skb) { /* implies readlock unlocked */
unix_state_lock(sk);
/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
(sk->sk_shutdown & RCV_SHUTDOWN))
err = 0;
unix_state_unlock(sk);
- goto out_unlock;
+ goto out;
}
- wake_up_interruptible_sync_poll(&u->peer_wait,
- POLLOUT | POLLWRNORM | POLLWRBAND);
+ if (wq_has_sleeper(&u->peer_wait))
+ wake_up_interruptible_sync_poll(&u->peer_wait,
+ POLLOUT | POLLWRNORM |
+ POLLWRBAND);
if (msg->msg_name)
unix_copy_addr(msg, skb->sk);
@@ -1962,7 +2202,6 @@ static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg,
out_free:
skb_free_datagram(sk, skb);
-out_unlock:
mutex_unlock(&u->readlock);
out:
return err;
@@ -1991,7 +2230,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
!timeo)
break;
- set_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
unix_state_unlock(sk);
timeo = freezable_schedule_timeout(timeo);
unix_state_lock(sk);
@@ -1999,7 +2238,7 @@ static long unix_stream_data_wait(struct sock *sk, long timeo,
if (sock_flag(sk, SOCK_DEAD))
break;
- clear_bit(SOCK_ASYNC_WAITDATA, &sk->sk_socket->flags);
+ sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
}
finish_wait(sk_sleep(sk), &wait);
@@ -2040,13 +2279,15 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
size_t size = state->size;
unsigned int last_len;
- err = -EINVAL;
- if (sk->sk_state != TCP_ESTABLISHED)
+ if (unlikely(sk->sk_state != TCP_ESTABLISHED)) {
+ err = -EINVAL;
goto out;
+ }
- err = -EOPNOTSUPP;
- if (flags & MSG_OOB)
+ if (unlikely(flags & MSG_OOB)) {
+ err = -EOPNOTSUPP;
goto out;
+ }
target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
timeo = sock_rcvtimeo(sk, noblock);
@@ -2056,14 +2297,7 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
/* Lock the socket to prevent queue disordering
* while sleeps in memcpy_tomsg
*/
- err = mutex_lock_interruptible(&u->readlock);
- if (unlikely(err)) {
- /* recvmsg() in non blocking mode is supposed to return -EAGAIN
- * sk_rcvtimeo is not honored by mutex_lock_interruptible()
- */
- err = noblock ? -EAGAIN : -ERESTARTSYS;
- goto out;
- }
+ mutex_lock(&u->readlock);
if (flags & MSG_PEEK)
skip = sk_peek_offset(sk, flags);
@@ -2072,8 +2306,10 @@ static int unix_stream_read_generic(struct unix_stream_read_state *state)
do {
int chunk;
+ bool drop_skb;
struct sk_buff *skb, *last;
+redo:
unix_state_lock(sk);
if (sock_flag(sk, SOCK_DEAD)) {
err = -ECONNRESET;
@@ -2098,21 +2334,24 @@ again:
goto unlock;
unix_state_unlock(sk);
- err = -EAGAIN;
- if (!timeo)
+ if (!timeo) {
+ err = -EAGAIN;
break;
+ }
+
mutex_unlock(&u->readlock);
timeo = unix_stream_data_wait(sk, timeo, last,
last_len);
- if (signal_pending(current) ||
- mutex_lock_interruptible(&u->readlock)) {
+ if (signal_pending(current)) {
err = sock_intr_errno(timeo);
+ scm_destroy(&scm);
goto out;
}
- continue;
+ mutex_lock(&u->readlock);
+ goto redo;
unlock:
unix_state_unlock(sk);
break;
@@ -2131,10 +2370,7 @@ unlock:
if (check_creds) {
/* Never glue messages from different writers */
- if ((UNIXCB(skb).pid != scm.pid) ||
- !uid_eq(UNIXCB(skb).uid, scm.creds.uid) ||
- !gid_eq(UNIXCB(skb).gid, scm.creds.gid) ||
- !unix_secdata_eq(&scm, skb))
+ if (!unix_skb_scm_eq(skb, &scm))
break;
} else if (test_bit(SOCK_PASSCRED, &sock->flags)) {
/* Copy credentials */
@@ -2152,7 +2388,11 @@ unlock:
}
chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
+ skb_get(skb);
chunk = state->recv_actor(skb, skip, chunk, state);
+ drop_skb = !unix_skb_len(skb);
+ /* skb is only safe to use if !drop_skb */
+ consume_skb(skb);
if (chunk < 0) {
if (copied == 0)
copied = -EFAULT;
@@ -2161,6 +2401,18 @@ unlock:
copied += chunk;
size -= chunk;
+ if (drop_skb) {
+ /* the skb was touched by a concurrent reader;
+ * we should not expect anything from this skb
+ * anymore and assume it invalid - we can be
+ * sure it was dropped from the socket queue
+ *
+ * let's report a short read
+ */
+ err = 0;
+ break;
+ }
+
/* Mark read part of skb as used */
if (!(flags & MSG_PEEK)) {
UNIXCB(skb).consumed += chunk;
@@ -2454,20 +2706,22 @@ static unsigned int unix_dgram_poll(struct file *file, struct socket *sock,
return mask;
writable = unix_writable(sk);
- other = unix_peer_get(sk);
- if (other) {
- if (unix_peer(other) != sk) {
- sock_poll_wait(file, &unix_sk(other)->peer_wait, wait);
- if (unix_recvq_full(other))
- writable = 0;
- }
- sock_put(other);
+ if (writable) {
+ unix_state_lock(sk);
+
+ other = unix_peer(sk);
+ if (other && unix_peer(other) != sk &&
+ unix_recvq_full(other) &&
+ unix_dgram_peer_wake_me(sk, other))
+ writable = 0;
+
+ unix_state_unlock(sk);
}
if (writable)
mask |= POLLOUT | POLLWRNORM | POLLWRBAND;
else
- set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags);
+ sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);
return mask;
}
diff --git a/net/unix/diag.c b/net/unix/diag.c
index c512f64d5287..4d9679701a6d 100644
--- a/net/unix/diag.c
+++ b/net/unix/diag.c
@@ -220,7 +220,7 @@ done:
return skb->len;
}
-static struct sock *unix_lookup_by_ino(int ino)
+static struct sock *unix_lookup_by_ino(unsigned int ino)
{
int i;
struct sock *sk;
diff --git a/net/unix/garbage.c b/net/unix/garbage.c
index a73a226f2d33..6a0d48525fcf 100644
--- a/net/unix/garbage.c
+++ b/net/unix/garbage.c
@@ -116,15 +116,15 @@ struct sock *unix_get_socket(struct file *filp)
* descriptor if it is for an AF_UNIX socket.
*/
-void unix_inflight(struct file *fp)
+void unix_inflight(struct user_struct *user, struct file *fp)
{
struct sock *s = unix_get_socket(fp);
+ spin_lock(&unix_gc_lock);
+
if (s) {
struct unix_sock *u = unix_sk(s);
- spin_lock(&unix_gc_lock);
-
if (atomic_long_inc_return(&u->inflight) == 1) {
BUG_ON(!list_empty(&u->link));
list_add_tail(&u->link, &gc_inflight_list);
@@ -132,25 +132,28 @@ void unix_inflight(struct file *fp)
BUG_ON(list_empty(&u->link));
}
unix_tot_inflight++;
- spin_unlock(&unix_gc_lock);
}
+ user->unix_inflight++;
+ spin_unlock(&unix_gc_lock);
}
-void unix_notinflight(struct file *fp)
+void unix_notinflight(struct user_struct *user, struct file *fp)
{
struct sock *s = unix_get_socket(fp);
+ spin_lock(&unix_gc_lock);
+
if (s) {
struct unix_sock *u = unix_sk(s);
- spin_lock(&unix_gc_lock);
BUG_ON(list_empty(&u->link));
if (atomic_long_dec_and_test(&u->inflight))
list_del_init(&u->link);
unix_tot_inflight--;
- spin_unlock(&unix_gc_lock);
}
+ user->unix_inflight--;
+ spin_unlock(&unix_gc_lock);
}
static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 7fd1220fbfa0..3dce53ebea92 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1209,10 +1209,14 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
- goto out_wait_error;
+ sk->sk_state = SS_UNCONNECTED;
+ sock->state = SS_UNCONNECTED;
+ goto out_wait;
} else if (timeout == 0) {
err = -ETIMEDOUT;
- goto out_wait_error;
+ sk->sk_state = SS_UNCONNECTED;
+ sock->state = SS_UNCONNECTED;
+ goto out_wait;
}
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
@@ -1220,20 +1224,17 @@ static int vsock_stream_connect(struct socket *sock, struct sockaddr *addr,
if (sk->sk_err) {
err = -sk->sk_err;
- goto out_wait_error;
- } else
+ sk->sk_state = SS_UNCONNECTED;
+ sock->state = SS_UNCONNECTED;
+ } else {
err = 0;
+ }
out_wait:
finish_wait(sk_sleep(sk), &wait);
out:
release_sock(sk);
return err;
-
-out_wait_error:
- sk->sk_state = SS_UNCONNECTED;
- sock->state = SS_UNCONNECTED;
- goto out_wait;
}
static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
@@ -1270,18 +1271,20 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
listener->sk_err == 0) {
release_sock(listener);
timeout = schedule_timeout(timeout);
+ finish_wait(sk_sleep(listener), &wait);
lock_sock(listener);
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
- goto out_wait;
+ goto out;
} else if (timeout == 0) {
err = -EAGAIN;
- goto out_wait;
+ goto out;
}
prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
}
+ finish_wait(sk_sleep(listener), &wait);
if (listener->sk_err)
err = -listener->sk_err;
@@ -1301,19 +1304,15 @@ static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
*/
if (err) {
vconnected->rejected = true;
- release_sock(connected);
- sock_put(connected);
- goto out_wait;
+ } else {
+ newsock->state = SS_CONNECTED;
+ sock_graft(connected, newsock);
}
- newsock->state = SS_CONNECTED;
- sock_graft(connected, newsock);
release_sock(connected);
sock_put(connected);
}
-out_wait:
- finish_wait(sk_sleep(listener), &wait);
out:
release_sock(listener);
return err;
@@ -1557,11 +1556,11 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
if (err < 0)
goto out;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
while (total_written < len) {
ssize_t written;
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
while (vsock_stream_has_space(vsk) == 0 &&
sk->sk_err == 0 &&
!(sk->sk_shutdown & SEND_SHUTDOWN) &&
@@ -1570,27 +1569,33 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
/* Don't wait for non-blocking sockets. */
if (timeout == 0) {
err = -EAGAIN;
- goto out_wait;
+ finish_wait(sk_sleep(sk), &wait);
+ goto out_err;
}
err = transport->notify_send_pre_block(vsk, &send_data);
- if (err < 0)
- goto out_wait;
+ if (err < 0) {
+ finish_wait(sk_sleep(sk), &wait);
+ goto out_err;
+ }
release_sock(sk);
timeout = schedule_timeout(timeout);
lock_sock(sk);
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
- goto out_wait;
+ finish_wait(sk_sleep(sk), &wait);
+ goto out_err;
} else if (timeout == 0) {
err = -EAGAIN;
- goto out_wait;
+ finish_wait(sk_sleep(sk), &wait);
+ goto out_err;
}
prepare_to_wait(sk_sleep(sk), &wait,
TASK_INTERRUPTIBLE);
}
+ finish_wait(sk_sleep(sk), &wait);
/* These checks occur both as part of and after the loop
* conditional since we need to check before and after
@@ -1598,16 +1603,16 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
*/
if (sk->sk_err) {
err = -sk->sk_err;
- goto out_wait;
+ goto out_err;
} else if ((sk->sk_shutdown & SEND_SHUTDOWN) ||
(vsk->peer_shutdown & RCV_SHUTDOWN)) {
err = -EPIPE;
- goto out_wait;
+ goto out_err;
}
err = transport->notify_send_pre_enqueue(vsk, &send_data);
if (err < 0)
- goto out_wait;
+ goto out_err;
/* Note that enqueue will only write as many bytes as are free
* in the produce queue, so we don't need to ensure len is
@@ -1620,7 +1625,7 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
len - total_written);
if (written < 0) {
err = -ENOMEM;
- goto out_wait;
+ goto out_err;
}
total_written += written;
@@ -1628,14 +1633,13 @@ static int vsock_stream_sendmsg(struct socket *sock, struct msghdr *msg,
err = transport->notify_send_post_enqueue(
vsk, written, &send_data);
if (err < 0)
- goto out_wait;
+ goto out_err;
}
-out_wait:
+out_err:
if (total_written > 0)
err = total_written;
- finish_wait(sk_sleep(sk), &wait);
out:
release_sock(sk);
return err;
@@ -1716,21 +1720,61 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
if (err < 0)
goto out;
- prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
while (1) {
- s64 ready = vsock_stream_has_data(vsk);
+ s64 ready;
- if (ready < 0) {
- /* Invalid queue pair content. XXX This should be
- * changed to a connection reset in a later change.
- */
+ prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+ ready = vsock_stream_has_data(vsk);
- err = -ENOMEM;
- goto out_wait;
- } else if (ready > 0) {
+ if (ready == 0) {
+ if (sk->sk_err != 0 ||
+ (sk->sk_shutdown & RCV_SHUTDOWN) ||
+ (vsk->peer_shutdown & SEND_SHUTDOWN)) {
+ finish_wait(sk_sleep(sk), &wait);
+ break;
+ }
+ /* Don't wait for non-blocking sockets. */
+ if (timeout == 0) {
+ err = -EAGAIN;
+ finish_wait(sk_sleep(sk), &wait);
+ break;
+ }
+
+ err = transport->notify_recv_pre_block(
+ vsk, target, &recv_data);
+ if (err < 0) {
+ finish_wait(sk_sleep(sk), &wait);
+ break;
+ }
+ release_sock(sk);
+ timeout = schedule_timeout(timeout);
+ lock_sock(sk);
+
+ if (signal_pending(current)) {
+ err = sock_intr_errno(timeout);
+ finish_wait(sk_sleep(sk), &wait);
+ break;
+ } else if (timeout == 0) {
+ err = -EAGAIN;
+ finish_wait(sk_sleep(sk), &wait);
+ break;
+ }
+ } else {
ssize_t read;
+ finish_wait(sk_sleep(sk), &wait);
+
+ if (ready < 0) {
+ /* Invalid queue pair content. XXX This should
+ * be changed to a connection reset in a later
+ * change.
+ */
+
+ err = -ENOMEM;
+ goto out;
+ }
+
err = transport->notify_recv_pre_dequeue(
vsk, target, &recv_data);
if (err < 0)
@@ -1750,42 +1794,12 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
vsk, target, read,
!(flags & MSG_PEEK), &recv_data);
if (err < 0)
- goto out_wait;
+ goto out;
if (read >= target || flags & MSG_PEEK)
break;
target -= read;
- } else {
- if (sk->sk_err != 0 || (sk->sk_shutdown & RCV_SHUTDOWN)
- || (vsk->peer_shutdown & SEND_SHUTDOWN)) {
- break;
- }
- /* Don't wait for non-blocking sockets. */
- if (timeout == 0) {
- err = -EAGAIN;
- break;
- }
-
- err = transport->notify_recv_pre_block(
- vsk, target, &recv_data);
- if (err < 0)
- break;
-
- release_sock(sk);
- timeout = schedule_timeout(timeout);
- lock_sock(sk);
-
- if (signal_pending(current)) {
- err = sock_intr_errno(timeout);
- break;
- } else if (timeout == 0) {
- err = -EAGAIN;
- break;
- }
-
- prepare_to_wait(sk_sleep(sk), &wait,
- TASK_INTERRUPTIBLE);
}
}
@@ -1816,8 +1830,6 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
err = copied;
}
-out_wait:
- finish_wait(sk_sleep(sk), &wait);
out:
release_sock(sk);
return err;
diff --git a/net/vmw_vsock/vmci_transport.c b/net/vmw_vsock/vmci_transport.c
index 400d87294de3..662bdd20a748 100644
--- a/net/vmw_vsock/vmci_transport.c
+++ b/net/vmw_vsock/vmci_transport.c
@@ -842,7 +842,7 @@ static void vmci_transport_peer_detach_cb(u32 sub_id,
* qp_handle.
*/
if (vmci_handle_is_invalid(e_payload->handle) ||
- vmci_handle_is_equal(trans->qp_handle, e_payload->handle))
+ !vmci_handle_is_equal(trans->qp_handle, e_payload->handle))
return;
/* We don't ask for delayed CBs when we subscribe to this event (we
@@ -1234,7 +1234,7 @@ vmci_transport_recv_connecting_server(struct sock *listener,
/* Callers of accept() will be be waiting on the listening socket, not
* the pending socket.
*/
- listener->sk_state_change(listener);
+ listener->sk_data_ready(listener);
return 0;
@@ -2154,7 +2154,7 @@ module_exit(vmci_transport_exit);
MODULE_AUTHOR("VMware, Inc.");
MODULE_DESCRIPTION("VMCI transport for Virtual Sockets");
-MODULE_VERSION("1.0.2.0-k");
+MODULE_VERSION("1.0.3.0-k");
MODULE_LICENSE("GPL v2");
MODULE_ALIAS("vmware_vsock");
MODULE_ALIAS_NETPROTO(PF_VSOCK);
diff --git a/net/vmw_vsock/vmci_transport.h b/net/vmw_vsock/vmci_transport.h
index 2ad46f39649f..1820e74a5752 100644
--- a/net/vmw_vsock/vmci_transport.h
+++ b/net/vmw_vsock/vmci_transport.h
@@ -121,7 +121,7 @@ struct vmci_transport {
u64 queue_pair_max_size;
u32 detach_sub_id;
union vmci_transport_notify notify;
- struct vmci_transport_notify_ops *notify_ops;
+ const struct vmci_transport_notify_ops *notify_ops;
struct list_head elem;
struct sock *sk;
spinlock_t lock; /* protects sk. */
diff --git a/net/vmw_vsock/vmci_transport_notify.c b/net/vmw_vsock/vmci_transport_notify.c
index 9b7f207f2bee..fd8cf0214d51 100644
--- a/net/vmw_vsock/vmci_transport_notify.c
+++ b/net/vmw_vsock/vmci_transport_notify.c
@@ -661,7 +661,7 @@ static void vmci_transport_notify_pkt_process_negotiate(struct sock *sk)
}
/* Socket control packet based operations. */
-struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
+const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops = {
vmci_transport_notify_pkt_socket_init,
vmci_transport_notify_pkt_socket_destruct,
vmci_transport_notify_pkt_poll_in,
diff --git a/net/vmw_vsock/vmci_transport_notify.h b/net/vmw_vsock/vmci_transport_notify.h
index 7df793249b6c..3c464d394a8f 100644
--- a/net/vmw_vsock/vmci_transport_notify.h
+++ b/net/vmw_vsock/vmci_transport_notify.h
@@ -77,7 +77,8 @@ struct vmci_transport_notify_ops {
void (*process_negotiate) (struct sock *sk);
};
-extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops;
-extern struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops;
+extern const struct vmci_transport_notify_ops vmci_transport_notify_pkt_ops;
+extern const
+struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops;
#endif /* __VMCI_TRANSPORT_NOTIFY_H__ */
diff --git a/net/vmw_vsock/vmci_transport_notify_qstate.c b/net/vmw_vsock/vmci_transport_notify_qstate.c
index dc9c7929a2f9..21e591dafb03 100644
--- a/net/vmw_vsock/vmci_transport_notify_qstate.c
+++ b/net/vmw_vsock/vmci_transport_notify_qstate.c
@@ -419,7 +419,7 @@ vmci_transport_notify_pkt_send_pre_enqueue(
}
/* Socket always on control packet based operations. */
-struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
+const struct vmci_transport_notify_ops vmci_transport_notify_pkt_q_state_ops = {
vmci_transport_notify_pkt_socket_init,
vmci_transport_notify_pkt_socket_destruct,
vmci_transport_notify_pkt_poll_in,
diff --git a/net/wireless/Kconfig b/net/wireless/Kconfig
index da72ed32f143..6c606120abfe 100644
--- a/net/wireless/Kconfig
+++ b/net/wireless/Kconfig
@@ -50,8 +50,8 @@ config CFG80211_DEVELOPER_WARNINGS
default n
help
This option enables some additional warnings that help
- cfg80211 developers and driver developers, but that can
- trigger due to races with userspace.
+ cfg80211 developers and driver developers, but beware that
+ they can also trigger due to races with userspace.
For example, when a driver reports that it was disconnected
from the AP, but the user disconnects manually at the same
@@ -61,19 +61,6 @@ config CFG80211_DEVELOPER_WARNINGS
on it (or mac80211).
-config CFG80211_REG_DEBUG
- bool "cfg80211 regulatory debugging"
- depends on CFG80211
- default n
- ---help---
- You can enable this if you want to debug regulatory changes.
- For more information on cfg80211 regulatory refer to the wireless
- wiki:
-
- http://wireless.kernel.org/en/developers/Regulatory
-
- If unsure, say N.
-
config CFG80211_CERTIFICATION_ONUS
bool "cfg80211 certification onus"
depends on CFG80211 && EXPERT
@@ -123,7 +110,7 @@ config CFG80211_REG_RELAX_NO_IR
interface which associated to an AP which userspace assumes or confirms
to be an authorized master, i.e., with radar detection support and DFS
capabilities. However, note that in order to not create daisy chain
- scenarios, this relaxation is not allowed in cases that the BSS client
+ scenarios, this relaxation is not allowed in cases where the BSS client
is associated to P2P GO and in addition the P2P GO instantiated on
a channel due to this relaxation should not allow connection from
non P2P clients.
@@ -148,7 +135,7 @@ config CFG80211_DEBUGFS
depends on CFG80211
depends on DEBUG_FS
---help---
- You can enable this if you want to debugfs entries for cfg80211.
+ You can enable this if you want debugfs entries for cfg80211.
If unsure, say N.
@@ -159,7 +146,7 @@ config CFG80211_INTERNAL_REGDB
---help---
This option generates an internal data structure representing
the wireless regulatory rules described in net/wireless/db.txt
- and includes code to query that database. This is an alternative
+ and includes code to query that database. This is an alternative
to using CRDA for defining regulatory rules for the kernel.
Using this option requires some parsing of the db.txt at build time,
@@ -172,7 +159,7 @@ config CFG80211_INTERNAL_REGDB
http://wireless.kernel.org/en/developers/Regulatory
- Most distributions have a CRDA package. So if unsure, say N.
+ Most distributions have a CRDA package. So if unsure, say N.
config CFG80211_CRDA_SUPPORT
bool "support CRDA" if CFG80211_INTERNAL_REGDB
diff --git a/net/wireless/core.c b/net/wireless/core.c
index b0915515640e..9f1c4aa851ef 100644
--- a/net/wireless/core.c
+++ b/net/wireless/core.c
@@ -352,6 +352,16 @@ struct wiphy *wiphy_new_nm(const struct cfg80211_ops *ops, int sizeof_priv,
WARN_ON(ops->add_station && !ops->del_station);
WARN_ON(ops->add_mpath && !ops->del_mpath);
WARN_ON(ops->join_mesh && !ops->leave_mesh);
+ WARN_ON(ops->start_p2p_device && !ops->stop_p2p_device);
+ WARN_ON(ops->start_ap && !ops->stop_ap);
+ WARN_ON(ops->join_ocb && !ops->leave_ocb);
+ WARN_ON(ops->suspend && !ops->resume);
+ WARN_ON(ops->sched_scan_start && !ops->sched_scan_stop);
+ WARN_ON(ops->remain_on_channel && !ops->cancel_remain_on_channel);
+ WARN_ON(ops->tdls_channel_switch && !ops->tdls_cancel_channel_switch);
+ WARN_ON(ops->add_tx_ts && !ops->del_tx_ts);
+ WARN_ON(ops->set_tx_power && !ops->get_tx_power);
+ WARN_ON(ops->set_antenna && !ops->get_antenna);
alloc_size = sizeof(*rdev) + sizeof_priv;
@@ -1147,6 +1157,8 @@ static int cfg80211_netdev_notifier_call(struct notifier_block *nb,
return NOTIFY_DONE;
}
+ wireless_nlevent_flush();
+
return NOTIFY_OK;
}
diff --git a/net/wireless/core.h b/net/wireless/core.h
index a618b4b86fa4..022ccad06cbe 100644
--- a/net/wireless/core.h
+++ b/net/wireless/core.h
@@ -416,13 +416,6 @@ int cfg80211_change_iface(struct cfg80211_registered_device *rdev,
void cfg80211_process_rdev_events(struct cfg80211_registered_device *rdev);
void cfg80211_process_wdev_events(struct wireless_dev *wdev);
-int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev,
- struct wireless_dev *wdev,
- enum nl80211_iftype iftype,
- struct ieee80211_channel *chan,
- enum cfg80211_chan_mode chanmode,
- u8 radar_detect);
-
/**
* cfg80211_chandef_dfs_usable - checks if chandef is DFS usable
* @wiphy: the wiphy to validate against
diff --git a/net/wireless/lib80211_crypt_ccmp.c b/net/wireless/lib80211_crypt_ccmp.c
index dc0e59e53dbf..6beab0cfcb99 100644
--- a/net/wireless/lib80211_crypt_ccmp.c
+++ b/net/wireless/lib80211_crypt_ccmp.c
@@ -311,8 +311,8 @@ static int lib80211_ccmp_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
}
keyidx >>= 6;
if (key->key_idx != keyidx) {
- printk(KERN_DEBUG "CCMP: RX tkey->key_idx=%d frame "
- "keyidx=%d priv=%p\n", key->key_idx, keyidx, priv);
+ net_dbg_ratelimited("CCMP: RX tkey->key_idx=%d frame keyidx=%d\n",
+ key->key_idx, keyidx);
return -6;
}
if (!key->key_set) {
diff --git a/net/wireless/lib80211_crypt_tkip.c b/net/wireless/lib80211_crypt_tkip.c
index 8c90ba79e56e..71447cf86306 100644
--- a/net/wireless/lib80211_crypt_tkip.c
+++ b/net/wireless/lib80211_crypt_tkip.c
@@ -29,7 +29,8 @@
#include <linux/ieee80211.h>
#include <net/iw_handler.h>
-#include <linux/crypto.h>
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <linux/crc32.h>
#include <net/lib80211.h>
@@ -63,10 +64,10 @@ struct lib80211_tkip_data {
int key_idx;
- struct crypto_blkcipher *rx_tfm_arc4;
- struct crypto_hash *rx_tfm_michael;
- struct crypto_blkcipher *tx_tfm_arc4;
- struct crypto_hash *tx_tfm_michael;
+ struct crypto_skcipher *rx_tfm_arc4;
+ struct crypto_ahash *rx_tfm_michael;
+ struct crypto_skcipher *tx_tfm_arc4;
+ struct crypto_ahash *tx_tfm_michael;
/* scratch buffers for virt_to_page() (crypto API) */
u8 rx_hdr[16], tx_hdr[16];
@@ -98,29 +99,29 @@ static void *lib80211_tkip_init(int key_idx)
priv->key_idx = key_idx;
- priv->tx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
- CRYPTO_ALG_ASYNC);
+ priv->tx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->tx_tfm_arc4)) {
priv->tx_tfm_arc4 = NULL;
goto fail;
}
- priv->tx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
- CRYPTO_ALG_ASYNC);
+ priv->tx_tfm_michael = crypto_alloc_ahash("michael_mic", 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->tx_tfm_michael)) {
priv->tx_tfm_michael = NULL;
goto fail;
}
- priv->rx_tfm_arc4 = crypto_alloc_blkcipher("ecb(arc4)", 0,
- CRYPTO_ALG_ASYNC);
+ priv->rx_tfm_arc4 = crypto_alloc_skcipher("ecb(arc4)", 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->rx_tfm_arc4)) {
priv->rx_tfm_arc4 = NULL;
goto fail;
}
- priv->rx_tfm_michael = crypto_alloc_hash("michael_mic", 0,
- CRYPTO_ALG_ASYNC);
+ priv->rx_tfm_michael = crypto_alloc_ahash("michael_mic", 0,
+ CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->rx_tfm_michael)) {
priv->rx_tfm_michael = NULL;
goto fail;
@@ -130,14 +131,10 @@ static void *lib80211_tkip_init(int key_idx)
fail:
if (priv) {
- if (priv->tx_tfm_michael)
- crypto_free_hash(priv->tx_tfm_michael);
- if (priv->tx_tfm_arc4)
- crypto_free_blkcipher(priv->tx_tfm_arc4);
- if (priv->rx_tfm_michael)
- crypto_free_hash(priv->rx_tfm_michael);
- if (priv->rx_tfm_arc4)
- crypto_free_blkcipher(priv->rx_tfm_arc4);
+ crypto_free_ahash(priv->tx_tfm_michael);
+ crypto_free_skcipher(priv->tx_tfm_arc4);
+ crypto_free_ahash(priv->rx_tfm_michael);
+ crypto_free_skcipher(priv->rx_tfm_arc4);
kfree(priv);
}
@@ -148,14 +145,10 @@ static void lib80211_tkip_deinit(void *priv)
{
struct lib80211_tkip_data *_priv = priv;
if (_priv) {
- if (_priv->tx_tfm_michael)
- crypto_free_hash(_priv->tx_tfm_michael);
- if (_priv->tx_tfm_arc4)
- crypto_free_blkcipher(_priv->tx_tfm_arc4);
- if (_priv->rx_tfm_michael)
- crypto_free_hash(_priv->rx_tfm_michael);
- if (_priv->rx_tfm_arc4)
- crypto_free_blkcipher(_priv->rx_tfm_arc4);
+ crypto_free_ahash(_priv->tx_tfm_michael);
+ crypto_free_skcipher(_priv->tx_tfm_arc4);
+ crypto_free_ahash(_priv->rx_tfm_michael);
+ crypto_free_skcipher(_priv->rx_tfm_arc4);
}
kfree(priv);
}
@@ -353,11 +346,12 @@ static int lib80211_tkip_hdr(struct sk_buff *skb, int hdr_len,
static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_tkip_data *tkey = priv;
- struct blkcipher_desc desc = { .tfm = tkey->tx_tfm_arc4 };
+ SKCIPHER_REQUEST_ON_STACK(req, tkey->tx_tfm_arc4);
int len;
u8 rc4key[16], *pos, *icv;
u32 crc;
struct scatterlist sg;
+ int err;
if (tkey->flags & IEEE80211_CRYPTO_TKIP_COUNTERMEASURES) {
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *)skb->data;
@@ -382,9 +376,14 @@ static int lib80211_tkip_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
icv[2] = crc >> 16;
icv[3] = crc >> 24;
- crypto_blkcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
+ crypto_skcipher_setkey(tkey->tx_tfm_arc4, rc4key, 16);
sg_init_one(&sg, pos, len + 4);
- return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
+ skcipher_request_set_tfm(req, tkey->tx_tfm_arc4);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL);
+ err = crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
+ return err;
}
/*
@@ -403,7 +402,7 @@ static inline int tkip_replay_check(u32 iv32_n, u16 iv16_n,
static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_tkip_data *tkey = priv;
- struct blkcipher_desc desc = { .tfm = tkey->rx_tfm_arc4 };
+ SKCIPHER_REQUEST_ON_STACK(req, tkey->rx_tfm_arc4);
u8 rc4key[16];
u8 keyidx, *pos;
u32 iv32;
@@ -413,6 +412,7 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
u32 crc;
struct scatterlist sg;
int plen;
+ int err;
hdr = (struct ieee80211_hdr *)skb->data;
@@ -434,8 +434,8 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
}
keyidx >>= 6;
if (tkey->key_idx != keyidx) {
- printk(KERN_DEBUG "TKIP: RX tkey->key_idx=%d frame "
- "keyidx=%d priv=%p\n", tkey->key_idx, keyidx, priv);
+ net_dbg_ratelimited("TKIP: RX tkey->key_idx=%d frame keyidx=%d\n",
+ tkey->key_idx, keyidx);
return -6;
}
if (!tkey->key_set) {
@@ -465,9 +465,14 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
plen = skb->len - hdr_len - 12;
- crypto_blkcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
+ crypto_skcipher_setkey(tkey->rx_tfm_arc4, rc4key, 16);
sg_init_one(&sg, pos, plen + 4);
- if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4)) {
+ skcipher_request_set_tfm(req, tkey->rx_tfm_arc4);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL);
+ err = crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
+ if (err) {
net_dbg_ratelimited("TKIP: failed to decrypt received packet from %pM\n",
hdr->addr2);
return -7;
@@ -505,11 +510,12 @@ static int lib80211_tkip_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
return keyidx;
}
-static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
+static int michael_mic(struct crypto_ahash *tfm_michael, u8 * key, u8 * hdr,
u8 * data, size_t data_len, u8 * mic)
{
- struct hash_desc desc;
+ AHASH_REQUEST_ON_STACK(req, tfm_michael);
struct scatterlist sg[2];
+ int err;
if (tfm_michael == NULL) {
pr_warn("%s(): tfm_michael == NULL\n", __func__);
@@ -519,12 +525,15 @@ static int michael_mic(struct crypto_hash *tfm_michael, u8 * key, u8 * hdr,
sg_set_buf(&sg[0], hdr, 16);
sg_set_buf(&sg[1], data, data_len);
- if (crypto_hash_setkey(tfm_michael, key, 8))
+ if (crypto_ahash_setkey(tfm_michael, key, 8))
return -1;
- desc.tfm = tfm_michael;
- desc.flags = 0;
- return crypto_hash_digest(&desc, sg, data_len + 16, mic);
+ ahash_request_set_tfm(req, tfm_michael);
+ ahash_request_set_callback(req, 0, NULL, NULL);
+ ahash_request_set_crypt(req, sg, mic, data_len + 16);
+ err = crypto_ahash_digest(req);
+ ahash_request_zero(req);
+ return err;
}
static void michael_mic_hdr(struct sk_buff *skb, u8 * hdr)
@@ -645,10 +654,10 @@ static int lib80211_tkip_set_key(void *key, int len, u8 * seq, void *priv)
{
struct lib80211_tkip_data *tkey = priv;
int keyidx;
- struct crypto_hash *tfm = tkey->tx_tfm_michael;
- struct crypto_blkcipher *tfm2 = tkey->tx_tfm_arc4;
- struct crypto_hash *tfm3 = tkey->rx_tfm_michael;
- struct crypto_blkcipher *tfm4 = tkey->rx_tfm_arc4;
+ struct crypto_ahash *tfm = tkey->tx_tfm_michael;
+ struct crypto_skcipher *tfm2 = tkey->tx_tfm_arc4;
+ struct crypto_ahash *tfm3 = tkey->rx_tfm_michael;
+ struct crypto_skcipher *tfm4 = tkey->rx_tfm_arc4;
keyidx = tkey->key_idx;
memset(tkey, 0, sizeof(*tkey));
diff --git a/net/wireless/lib80211_crypt_wep.c b/net/wireless/lib80211_crypt_wep.c
index 1c292e4ea7b6..d05f58b0fd04 100644
--- a/net/wireless/lib80211_crypt_wep.c
+++ b/net/wireless/lib80211_crypt_wep.c
@@ -22,7 +22,7 @@
#include <net/lib80211.h>
-#include <linux/crypto.h>
+#include <crypto/skcipher.h>
#include <linux/crc32.h>
MODULE_AUTHOR("Jouni Malinen");
@@ -35,8 +35,8 @@ struct lib80211_wep_data {
u8 key[WEP_KEY_LEN + 1];
u8 key_len;
u8 key_idx;
- struct crypto_blkcipher *tx_tfm;
- struct crypto_blkcipher *rx_tfm;
+ struct crypto_skcipher *tx_tfm;
+ struct crypto_skcipher *rx_tfm;
};
static void *lib80211_wep_init(int keyidx)
@@ -48,13 +48,13 @@ static void *lib80211_wep_init(int keyidx)
goto fail;
priv->key_idx = keyidx;
- priv->tx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+ priv->tx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->tx_tfm)) {
priv->tx_tfm = NULL;
goto fail;
}
- priv->rx_tfm = crypto_alloc_blkcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
+ priv->rx_tfm = crypto_alloc_skcipher("ecb(arc4)", 0, CRYPTO_ALG_ASYNC);
if (IS_ERR(priv->rx_tfm)) {
priv->rx_tfm = NULL;
goto fail;
@@ -66,10 +66,8 @@ static void *lib80211_wep_init(int keyidx)
fail:
if (priv) {
- if (priv->tx_tfm)
- crypto_free_blkcipher(priv->tx_tfm);
- if (priv->rx_tfm)
- crypto_free_blkcipher(priv->rx_tfm);
+ crypto_free_skcipher(priv->tx_tfm);
+ crypto_free_skcipher(priv->rx_tfm);
kfree(priv);
}
return NULL;
@@ -79,10 +77,8 @@ static void lib80211_wep_deinit(void *priv)
{
struct lib80211_wep_data *_priv = priv;
if (_priv) {
- if (_priv->tx_tfm)
- crypto_free_blkcipher(_priv->tx_tfm);
- if (_priv->rx_tfm)
- crypto_free_blkcipher(_priv->rx_tfm);
+ crypto_free_skcipher(_priv->tx_tfm);
+ crypto_free_skcipher(_priv->rx_tfm);
}
kfree(priv);
}
@@ -133,11 +129,12 @@ static int lib80211_wep_build_iv(struct sk_buff *skb, int hdr_len,
static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_wep_data *wep = priv;
- struct blkcipher_desc desc = { .tfm = wep->tx_tfm };
+ SKCIPHER_REQUEST_ON_STACK(req, wep->tx_tfm);
u32 crc, klen, len;
u8 *pos, *icv;
struct scatterlist sg;
u8 key[WEP_KEY_LEN + 3];
+ int err;
/* other checks are in lib80211_wep_build_iv */
if (skb_tailroom(skb) < 4)
@@ -165,9 +162,14 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
icv[2] = crc >> 16;
icv[3] = crc >> 24;
- crypto_blkcipher_setkey(wep->tx_tfm, key, klen);
+ crypto_skcipher_setkey(wep->tx_tfm, key, klen);
sg_init_one(&sg, pos, len + 4);
- return crypto_blkcipher_encrypt(&desc, &sg, &sg, len + 4);
+ skcipher_request_set_tfm(req, wep->tx_tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg, &sg, len + 4, NULL);
+ err = crypto_skcipher_encrypt(req);
+ skcipher_request_zero(req);
+ return err;
}
/* Perform WEP decryption on given buffer. Buffer includes whole WEP part of
@@ -180,11 +182,12 @@ static int lib80211_wep_encrypt(struct sk_buff *skb, int hdr_len, void *priv)
static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
{
struct lib80211_wep_data *wep = priv;
- struct blkcipher_desc desc = { .tfm = wep->rx_tfm };
+ SKCIPHER_REQUEST_ON_STACK(req, wep->rx_tfm);
u32 crc, klen, plen;
u8 key[WEP_KEY_LEN + 3];
u8 keyidx, *pos, icv[4];
struct scatterlist sg;
+ int err;
if (skb->len < hdr_len + 8)
return -1;
@@ -205,9 +208,14 @@ static int lib80211_wep_decrypt(struct sk_buff *skb, int hdr_len, void *priv)
/* Apply RC4 to data and compute CRC32 over decrypted data */
plen = skb->len - hdr_len - 8;
- crypto_blkcipher_setkey(wep->rx_tfm, key, klen);
+ crypto_skcipher_setkey(wep->rx_tfm, key, klen);
sg_init_one(&sg, pos, plen + 4);
- if (crypto_blkcipher_decrypt(&desc, &sg, &sg, plen + 4))
+ skcipher_request_set_tfm(req, wep->rx_tfm);
+ skcipher_request_set_callback(req, 0, NULL, NULL);
+ skcipher_request_set_crypt(req, &sg, &sg, plen + 4, NULL);
+ err = crypto_skcipher_decrypt(req);
+ skcipher_request_zero(req);
+ if (err)
return -7;
crc = ~crc32_le(~0, pos, plen);
diff --git a/net/wireless/mlme.c b/net/wireless/mlme.c
index fb44fa3bf4ef..ff328250bc44 100644
--- a/net/wireless/mlme.c
+++ b/net/wireless/mlme.c
@@ -711,7 +711,7 @@ EXPORT_SYMBOL(cfg80211_rx_mgmt);
void cfg80211_dfs_channels_update_work(struct work_struct *work)
{
- struct delayed_work *delayed_work;
+ struct delayed_work *delayed_work = to_delayed_work(work);
struct cfg80211_registered_device *rdev;
struct cfg80211_chan_def chandef;
struct ieee80211_supported_band *sband;
@@ -721,7 +721,6 @@ void cfg80211_dfs_channels_update_work(struct work_struct *work)
unsigned long timeout, next_time = 0;
int bandid, i;
- delayed_work = container_of(work, struct delayed_work, work);
rdev = container_of(delayed_work, struct cfg80211_registered_device,
dfs_update_channels_wk);
wiphy = &rdev->wiphy;
diff --git a/net/wireless/nl80211.c b/net/wireless/nl80211.c
index c71e274c810a..98c924260b3d 100644
--- a/net/wireless/nl80211.c
+++ b/net/wireless/nl80211.c
@@ -3,7 +3,7 @@
*
* Copyright 2006-2010 Johannes Berg <johannes@sipsolutions.net>
* Copyright 2013-2014 Intel Mobile Communications GmbH
- * Copyright 2015 Intel Deutschland GmbH
+ * Copyright 2015-2016 Intel Deutschland GmbH
*/
#include <linux/if.h>
@@ -401,6 +401,7 @@ static const struct nla_policy nl80211_policy[NUM_NL80211_ATTR] = {
[NL80211_ATTR_NETNS_FD] = { .type = NLA_U32 },
[NL80211_ATTR_SCHED_SCAN_DELAY] = { .type = NLA_U32 },
[NL80211_ATTR_REG_INDOOR] = { .type = NLA_FLAG },
+ [NL80211_ATTR_PBSS] = { .type = NLA_FLAG },
};
/* policy for the key attributes */
@@ -3461,6 +3462,10 @@ static int nl80211_start_ap(struct sk_buff *skb, struct genl_info *info)
return PTR_ERR(params.acl);
}
+ params.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+ if (params.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ])
+ return -EOPNOTSUPP;
+
wdev_lock(wdev);
err = rdev_start_ap(rdev, dev, &params);
if (!err) {
@@ -4256,8 +4261,8 @@ static int nl80211_set_station(struct sk_buff *skb, struct genl_info *info)
* station. Include these parameters here and will check them in
* cfg80211_check_station_change().
*/
- if (info->attrs[NL80211_ATTR_PEER_AID])
- params.aid = nla_get_u16(info->attrs[NL80211_ATTR_PEER_AID]);
+ if (info->attrs[NL80211_ATTR_STA_AID])
+ params.aid = nla_get_u16(info->attrs[NL80211_ATTR_STA_AID]);
if (info->attrs[NL80211_ATTR_STA_LISTEN_INTERVAL])
params.listen_interval =
@@ -4359,6 +4364,8 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
struct net_device *dev = info->user_ptr[1];
struct station_parameters params;
u8 *mac_addr = NULL;
+ u32 auth_assoc = BIT(NL80211_STA_FLAG_AUTHENTICATED) |
+ BIT(NL80211_STA_FLAG_ASSOCIATED);
memset(&params, 0, sizeof(params));
@@ -4470,10 +4477,23 @@ static int nl80211_new_station(struct sk_buff *skb, struct genl_info *info)
/* allow authenticated/associated only if driver handles it */
if (!(rdev->wiphy.features &
NL80211_FEATURE_FULL_AP_CLIENT_STATE) &&
- params.sta_flags_mask &
- (BIT(NL80211_STA_FLAG_AUTHENTICATED) |
- BIT(NL80211_STA_FLAG_ASSOCIATED)))
- return -EINVAL;
+ params.sta_flags_mask & auth_assoc)
+ return -EINVAL;
+
+ /* Older userspace, or userspace wanting to be compatible with
+ * !NL80211_FEATURE_FULL_AP_CLIENT_STATE, will not set the auth
+ * and assoc flags in the mask, but assumes the station will be
+ * added as associated anyway since this was the required driver
+ * behaviour before NL80211_FEATURE_FULL_AP_CLIENT_STATE was
+ * introduced.
+ * In order to not bother drivers with this quirk in the API
+ * set the flags in both the mask and set for new stations in
+ * this case.
+ */
+ if (!(params.sta_flags_mask & auth_assoc)) {
+ params.sta_flags_mask |= auth_assoc;
+ params.sta_flags_set |= auth_assoc;
+ }
/* must be last in here for error handling */
params.vlan = get_vlan(info, rdev);
@@ -5997,6 +6017,24 @@ static int nl80211_trigger_scan(struct sk_buff *skb, struct genl_info *info)
return err;
}
+static int nl80211_abort_scan(struct sk_buff *skb, struct genl_info *info)
+{
+ struct cfg80211_registered_device *rdev = info->user_ptr[0];
+ struct wireless_dev *wdev = info->user_ptr[1];
+
+ if (!rdev->ops->abort_scan)
+ return -EOPNOTSUPP;
+
+ if (rdev->scan_msg)
+ return 0;
+
+ if (!rdev->scan_req)
+ return -ENOENT;
+
+ rdev_abort_scan(rdev, wdev);
+ return 0;
+}
+
static int
nl80211_parse_sched_scan_plans(struct wiphy *wiphy, int n_plans,
struct cfg80211_sched_scan_request *request,
@@ -6507,8 +6545,7 @@ static int nl80211_start_radar_detection(struct sk_buff *skb,
if (WARN_ON(!cac_time_ms))
cac_time_ms = IEEE80211_DFS_MIN_CAC_TIME_MS;
- err = rdev->ops->start_radar_detection(&rdev->wiphy, dev, &chandef,
- cac_time_ms);
+ err = rdev_start_radar_detection(rdev, dev, &chandef, cac_time_ms);
if (!err) {
wdev->chandef = chandef;
wdev->cac_started = true;
@@ -7249,9 +7286,11 @@ static int nl80211_associate(struct sk_buff *skb, struct genl_info *info)
}
if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
- if (!(rdev->wiphy.features &
- NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
- !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+ if (!((rdev->wiphy.features &
+ NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+ (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_RRM))
return -EINVAL;
req.flags |= ASSOC_REQ_USE_RRM;
}
@@ -7515,7 +7554,7 @@ static int nl80211_join_ibss(struct sk_buff *skb, struct genl_info *info)
if ((ibss.chandef.width != NL80211_CHAN_WIDTH_20_NOHT) &&
no_ht) {
- kfree(connkeys);
+ kzfree(connkeys);
return -EINVAL;
}
}
@@ -7571,7 +7610,7 @@ static int nl80211_set_mcast_rate(struct sk_buff *skb, struct genl_info *info)
if (!nl80211_parse_mcast_rate(rdev, mcast_rate, nla_rate))
return -EINVAL;
- err = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate);
+ err = rdev_set_mcast_rate(rdev, dev, mcast_rate);
return err;
}
@@ -7939,13 +7978,23 @@ static int nl80211_connect(struct sk_buff *skb, struct genl_info *info)
}
if (nla_get_flag(info->attrs[NL80211_ATTR_USE_RRM])) {
- if (!(rdev->wiphy.features &
- NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) ||
- !(rdev->wiphy.features & NL80211_FEATURE_QUIET))
+ if (!((rdev->wiphy.features &
+ NL80211_FEATURE_DS_PARAM_SET_IE_IN_PROBES) &&
+ (rdev->wiphy.features & NL80211_FEATURE_QUIET)) &&
+ !wiphy_ext_feature_isset(&rdev->wiphy,
+ NL80211_EXT_FEATURE_RRM)) {
+ kzfree(connkeys);
return -EINVAL;
+ }
connect.flags |= ASSOC_REQ_USE_RRM;
}
+ connect.pbss = nla_get_flag(info->attrs[NL80211_ATTR_PBSS]);
+ if (connect.pbss && !rdev->wiphy.bands[IEEE80211_BAND_60GHZ]) {
+ kzfree(connkeys);
+ return -EOPNOTSUPP;
+ }
+
wdev_lock(dev->ieee80211_ptr);
err = cfg80211_connect(rdev, dev, &connect, connkeys, NULL);
wdev_unlock(dev->ieee80211_ptr);
@@ -9503,6 +9552,7 @@ static int nl80211_set_wowlan(struct sk_buff *skb, struct genl_info *info)
if (new_triggers.tcp && new_triggers.tcp->sock)
sock_release(new_triggers.tcp->sock);
kfree(new_triggers.tcp);
+ kfree(new_triggers.nd_config);
return err;
}
#endif
@@ -9716,7 +9766,7 @@ static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info)
if (!info->attrs[NL80211_ATTR_COALESCE_RULE]) {
cfg80211_rdev_free_coalesce(rdev);
- rdev->ops->set_coalesce(&rdev->wiphy, NULL);
+ rdev_set_coalesce(rdev, NULL);
return 0;
}
@@ -9744,7 +9794,7 @@ static int nl80211_set_coalesce(struct sk_buff *skb, struct genl_info *info)
i++;
}
- err = rdev->ops->set_coalesce(&rdev->wiphy, &new_coalesce);
+ err = rdev_set_coalesce(rdev, &new_coalesce);
if (err)
goto error;
@@ -10946,6 +10996,14 @@ static const struct genl_ops nl80211_ops[] = {
NL80211_FLAG_NEED_RTNL,
},
{
+ .cmd = NL80211_CMD_ABORT_SCAN,
+ .doit = nl80211_abort_scan,
+ .policy = nl80211_policy,
+ .flags = GENL_ADMIN_PERM,
+ .internal_flags = NL80211_FLAG_NEED_WDEV_UP |
+ NL80211_FLAG_NEED_RTNL,
+ },
+ {
.cmd = NL80211_CMD_GET_SCAN,
.policy = nl80211_policy,
.dumpit = nl80211_dump_scan,
diff --git a/net/wireless/ocb.c b/net/wireless/ocb.c
index c00d4a792319..e64dbf16330c 100644
--- a/net/wireless/ocb.c
+++ b/net/wireless/ocb.c
@@ -29,6 +29,9 @@ int __cfg80211_join_ocb(struct cfg80211_registered_device *rdev,
if (dev->ieee80211_ptr->iftype != NL80211_IFTYPE_OCB)
return -EOPNOTSUPP;
+ if (!rdev->ops->join_ocb)
+ return -EOPNOTSUPP;
+
if (WARN_ON(!setup->chandef.chan))
return -EINVAL;
diff --git a/net/wireless/radiotap.c b/net/wireless/radiotap.c
index 722da616438c..6582d155e2fc 100644
--- a/net/wireless/radiotap.c
+++ b/net/wireless/radiotap.c
@@ -43,6 +43,7 @@ static const struct radiotap_align_size rtap_namespace_sizes[] = {
[IEEE80211_RADIOTAP_DATA_RETRIES] = { .align = 1, .size = 1, },
[IEEE80211_RADIOTAP_MCS] = { .align = 1, .size = 3, },
[IEEE80211_RADIOTAP_AMPDU_STATUS] = { .align = 4, .size = 8, },
+ [IEEE80211_RADIOTAP_VHT] = { .align = 2, .size = 12, },
/*
* add more here as they are defined in radiotap.h
*/
diff --git a/net/wireless/rdev-ops.h b/net/wireless/rdev-ops.h
index c23516d0f807..8ae0c04f9fc7 100644
--- a/net/wireless/rdev-ops.h
+++ b/net/wireless/rdev-ops.h
@@ -427,6 +427,14 @@ static inline int rdev_scan(struct cfg80211_registered_device *rdev,
return ret;
}
+static inline void rdev_abort_scan(struct cfg80211_registered_device *rdev,
+ struct wireless_dev *wdev)
+{
+ trace_rdev_abort_scan(&rdev->wiphy, wdev);
+ rdev->ops->abort_scan(&rdev->wiphy, wdev);
+ trace_rdev_return_void(&rdev->wiphy);
+}
+
static inline int rdev_auth(struct cfg80211_registered_device *rdev,
struct net_device *dev,
struct cfg80211_auth_request *req)
@@ -1020,4 +1028,47 @@ rdev_tdls_cancel_channel_switch(struct cfg80211_registered_device *rdev,
trace_rdev_return_void(&rdev->wiphy);
}
+static inline int
+rdev_start_radar_detection(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ struct cfg80211_chan_def *chandef,
+ u32 cac_time_ms)
+{
+ int ret = -ENOTSUPP;
+
+ trace_rdev_start_radar_detection(&rdev->wiphy, dev, chandef,
+ cac_time_ms);
+ if (rdev->ops->start_radar_detection)
+ ret = rdev->ops->start_radar_detection(&rdev->wiphy, dev,
+ chandef, cac_time_ms);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_mcast_rate(struct cfg80211_registered_device *rdev,
+ struct net_device *dev,
+ int mcast_rate[IEEE80211_NUM_BANDS])
+{
+ int ret = -ENOTSUPP;
+
+ trace_rdev_set_mcast_rate(&rdev->wiphy, dev, mcast_rate);
+ if (rdev->ops->set_mcast_rate)
+ ret = rdev->ops->set_mcast_rate(&rdev->wiphy, dev, mcast_rate);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
+
+static inline int
+rdev_set_coalesce(struct cfg80211_registered_device *rdev,
+ struct cfg80211_coalesce *coalesce)
+{
+ int ret = -ENOTSUPP;
+
+ trace_rdev_set_coalesce(&rdev->wiphy, coalesce);
+ if (rdev->ops->set_coalesce)
+ ret = rdev->ops->set_coalesce(&rdev->wiphy, coalesce);
+ trace_rdev_return_int(&rdev->wiphy, ret);
+ return ret;
+}
#endif /* __CFG80211_RDEV_OPS */
diff --git a/net/wireless/reg.c b/net/wireless/reg.c
index 2e8d6f39ed56..c5fb317eee68 100644
--- a/net/wireless/reg.c
+++ b/net/wireless/reg.c
@@ -60,13 +60,6 @@
#include "regdb.h"
#include "nl80211.h"
-#ifdef CONFIG_CFG80211_REG_DEBUG
-#define REG_DBG_PRINT(format, args...) \
- printk(KERN_DEBUG pr_fmt(format), ##args)
-#else
-#define REG_DBG_PRINT(args...)
-#endif
-
/*
* Grace period we give before making sure all current interfaces reside on
* channels allowed by the current regulatory domain.
@@ -178,12 +171,10 @@ enum nl80211_dfs_regions reg_get_dfs_region(struct wiphy *wiphy)
if (wiphy_regd->dfs_region == regd->dfs_region)
goto out;
- REG_DBG_PRINT("%s: device specific dfs_region "
- "(%s) disagrees with cfg80211's "
- "central dfs_region (%s)\n",
- dev_name(&wiphy->dev),
- reg_dfs_region_str(wiphy_regd->dfs_region),
- reg_dfs_region_str(regd->dfs_region));
+ pr_debug("%s: device specific dfs_region (%s) disagrees with cfg80211's central dfs_region (%s)\n",
+ dev_name(&wiphy->dev),
+ reg_dfs_region_str(wiphy_regd->dfs_region),
+ reg_dfs_region_str(regd->dfs_region));
out:
return regd->dfs_region;
@@ -231,20 +222,22 @@ static const struct ieee80211_regdomain world_regdom = {
/* IEEE 802.11b/g, channels 1..11 */
REG_RULE(2412-10, 2462+10, 40, 6, 20, 0),
/* IEEE 802.11b/g, channels 12..13. */
- REG_RULE(2467-10, 2472+10, 40, 6, 20,
- NL80211_RRF_NO_IR),
+ REG_RULE(2467-10, 2472+10, 20, 6, 20,
+ NL80211_RRF_NO_IR | NL80211_RRF_AUTO_BW),
/* IEEE 802.11 channel 14 - Only JP enables
* this and for 802.11b only */
REG_RULE(2484-10, 2484+10, 20, 6, 20,
NL80211_RRF_NO_IR |
NL80211_RRF_NO_OFDM),
/* IEEE 802.11a, channel 36..48 */
- REG_RULE(5180-10, 5240+10, 160, 6, 20,
- NL80211_RRF_NO_IR),
+ REG_RULE(5180-10, 5240+10, 80, 6, 20,
+ NL80211_RRF_NO_IR |
+ NL80211_RRF_AUTO_BW),
/* IEEE 802.11a, channel 52..64 - DFS required */
- REG_RULE(5260-10, 5320+10, 160, 6, 20,
+ REG_RULE(5260-10, 5320+10, 80, 6, 20,
NL80211_RRF_NO_IR |
+ NL80211_RRF_AUTO_BW |
NL80211_RRF_DFS),
/* IEEE 802.11a, channel 100..144 - DFS required */
@@ -541,7 +534,7 @@ static DECLARE_DELAYED_WORK(crda_timeout, crda_timeout_work);
static void crda_timeout_work(struct work_struct *work)
{
- REG_DBG_PRINT("Timeout while waiting for CRDA to reply, restoring regulatory settings\n");
+ pr_debug("Timeout while waiting for CRDA to reply, restoring regulatory settings\n");
rtnl_lock();
reg_crda_timeouts++;
restore_regulatory_settings(true);
@@ -583,7 +576,7 @@ static int call_crda(const char *alpha2)
if (!is_world_regdom((char *) alpha2))
pr_debug("Calling CRDA for country: %c%c\n",
- alpha2[0], alpha2[1]);
+ alpha2[0], alpha2[1]);
else
pr_debug("Calling CRDA to update world regulatory domain\n");
@@ -1052,7 +1045,7 @@ static u32 map_regdom_flags(u32 rd_flags)
}
static const struct ieee80211_reg_rule *
-freq_reg_info_regd(struct wiphy *wiphy, u32 center_freq,
+freq_reg_info_regd(u32 center_freq,
const struct ieee80211_regdomain *regd, u32 bw)
{
int i;
@@ -1097,7 +1090,7 @@ __freq_reg_info(struct wiphy *wiphy, u32 center_freq, u32 min_bw)
u32 bw;
for (bw = MHZ_TO_KHZ(20); bw >= min_bw; bw = bw / 2) {
- reg_rule = freq_reg_info_regd(wiphy, center_freq, regd, bw);
+ reg_rule = freq_reg_info_regd(center_freq, regd, bw);
if (!IS_ERR(reg_rule))
return reg_rule;
}
@@ -1130,40 +1123,39 @@ const char *reg_initiator_name(enum nl80211_reg_initiator initiator)
}
EXPORT_SYMBOL(reg_initiator_name);
-static void chan_reg_rule_print_dbg(const struct ieee80211_regdomain *regd,
- struct ieee80211_channel *chan,
- const struct ieee80211_reg_rule *reg_rule)
+static uint32_t reg_rule_to_chan_bw_flags(const struct ieee80211_regdomain *regd,
+ const struct ieee80211_reg_rule *reg_rule,
+ const struct ieee80211_channel *chan)
{
-#ifdef CONFIG_CFG80211_REG_DEBUG
- const struct ieee80211_power_rule *power_rule;
- const struct ieee80211_freq_range *freq_range;
- char max_antenna_gain[32], bw[32];
+ const struct ieee80211_freq_range *freq_range = NULL;
+ u32 max_bandwidth_khz, bw_flags = 0;
- power_rule = &reg_rule->power_rule;
freq_range = &reg_rule->freq_range;
- if (!power_rule->max_antenna_gain)
- snprintf(max_antenna_gain, sizeof(max_antenna_gain), "N/A");
- else
- snprintf(max_antenna_gain, sizeof(max_antenna_gain), "%d mBi",
- power_rule->max_antenna_gain);
-
+ max_bandwidth_khz = freq_range->max_bandwidth_khz;
+ /* Check if auto calculation requested */
if (reg_rule->flags & NL80211_RRF_AUTO_BW)
- snprintf(bw, sizeof(bw), "%d KHz, %d KHz AUTO",
- freq_range->max_bandwidth_khz,
- reg_get_max_bandwidth(regd, reg_rule));
- else
- snprintf(bw, sizeof(bw), "%d KHz",
- freq_range->max_bandwidth_khz);
+ max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
- REG_DBG_PRINT("Updating information on frequency %d MHz with regulatory rule:\n",
- chan->center_freq);
+ /* If we get a reg_rule we can assume that at least 5Mhz fit */
+ if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
+ MHZ_TO_KHZ(10)))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
+ MHZ_TO_KHZ(20)))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- REG_DBG_PRINT("(%d KHz - %d KHz @ %s), (%s, %d mBm)\n",
- freq_range->start_freq_khz, freq_range->end_freq_khz,
- bw, max_antenna_gain,
- power_rule->max_eirp);
-#endif
+ if (max_bandwidth_khz < MHZ_TO_KHZ(10))
+ bw_flags |= IEEE80211_CHAN_NO_10MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(20))
+ bw_flags |= IEEE80211_CHAN_NO_20MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(40))
+ bw_flags |= IEEE80211_CHAN_NO_HT40;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(80))
+ bw_flags |= IEEE80211_CHAN_NO_80MHZ;
+ if (max_bandwidth_khz < MHZ_TO_KHZ(160))
+ bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+ return bw_flags;
}
/*
@@ -1178,11 +1170,9 @@ static void handle_channel(struct wiphy *wiphy,
u32 flags, bw_flags = 0;
const struct ieee80211_reg_rule *reg_rule = NULL;
const struct ieee80211_power_rule *power_rule = NULL;
- const struct ieee80211_freq_range *freq_range = NULL;
struct wiphy *request_wiphy = NULL;
struct regulatory_request *lr = get_last_request();
const struct ieee80211_regdomain *regd;
- u32 max_bandwidth_khz;
request_wiphy = wiphy_idx_to_wiphy(lr->wiphy_idx);
@@ -1207,47 +1197,22 @@ static void handle_channel(struct wiphy *wiphy,
if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
request_wiphy && request_wiphy == wiphy &&
request_wiphy->regulatory_flags & REGULATORY_STRICT_REG) {
- REG_DBG_PRINT("Disabling freq %d MHz for good\n",
- chan->center_freq);
+ pr_debug("Disabling freq %d MHz for good\n",
+ chan->center_freq);
chan->orig_flags |= IEEE80211_CHAN_DISABLED;
chan->flags = chan->orig_flags;
} else {
- REG_DBG_PRINT("Disabling freq %d MHz\n",
- chan->center_freq);
+ pr_debug("Disabling freq %d MHz\n",
+ chan->center_freq);
chan->flags |= IEEE80211_CHAN_DISABLED;
}
return;
}
regd = reg_get_regdomain(wiphy);
- chan_reg_rule_print_dbg(regd, chan, reg_rule);
power_rule = &reg_rule->power_rule;
- freq_range = &reg_rule->freq_range;
-
- max_bandwidth_khz = freq_range->max_bandwidth_khz;
- /* Check if auto calculation requested */
- if (reg_rule->flags & NL80211_RRF_AUTO_BW)
- max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
-
- /* If we get a reg_rule we can assume that at least 5Mhz fit */
- if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
- MHZ_TO_KHZ(10)))
- bw_flags |= IEEE80211_CHAN_NO_10MHZ;
- if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
- MHZ_TO_KHZ(20)))
- bw_flags |= IEEE80211_CHAN_NO_20MHZ;
-
- if (max_bandwidth_khz < MHZ_TO_KHZ(10))
- bw_flags |= IEEE80211_CHAN_NO_10MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(20))
- bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(40))
- bw_flags |= IEEE80211_CHAN_NO_HT40;
- if (max_bandwidth_khz < MHZ_TO_KHZ(80))
- bw_flags |= IEEE80211_CHAN_NO_80MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(160))
- bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+ bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan);
if (lr->initiator == NL80211_REGDOM_SET_BY_DRIVER &&
request_wiphy && request_wiphy == wiphy &&
@@ -1382,18 +1347,15 @@ static bool ignore_reg_update(struct wiphy *wiphy,
return true;
if (!lr) {
- REG_DBG_PRINT("Ignoring regulatory request set by %s "
- "since last_request is not set\n",
- reg_initiator_name(initiator));
+ pr_debug("Ignoring regulatory request set by %s since last_request is not set\n",
+ reg_initiator_name(initiator));
return true;
}
if (initiator == NL80211_REGDOM_SET_BY_CORE &&
wiphy->regulatory_flags & REGULATORY_CUSTOM_REG) {
- REG_DBG_PRINT("Ignoring regulatory request set by %s "
- "since the driver uses its own custom "
- "regulatory domain\n",
- reg_initiator_name(initiator));
+ pr_debug("Ignoring regulatory request set by %s since the driver uses its own custom regulatory domain\n",
+ reg_initiator_name(initiator));
return true;
}
@@ -1404,10 +1366,8 @@ static bool ignore_reg_update(struct wiphy *wiphy,
if (wiphy_strict_alpha2_regd(wiphy) && !wiphy->regd &&
initiator != NL80211_REGDOM_SET_BY_COUNTRY_IE &&
!is_world_regdom(lr->alpha2)) {
- REG_DBG_PRINT("Ignoring regulatory request set by %s "
- "since the driver requires its own regulatory "
- "domain to be set first\n",
- reg_initiator_name(initiator));
+ pr_debug("Ignoring regulatory request set by %s since the driver requires its own regulatory domain to be set first\n",
+ reg_initiator_name(initiator));
return true;
}
@@ -1688,7 +1648,7 @@ static void reg_check_chans_work(struct work_struct *work)
{
struct cfg80211_registered_device *rdev;
- REG_DBG_PRINT("Verifying active interfaces after reg change\n");
+ pr_debug("Verifying active interfaces after reg change\n");
rtnl_lock();
list_for_each_entry(rdev, &cfg80211_rdev_list, list)
@@ -1760,21 +1720,18 @@ static void handle_channel_custom(struct wiphy *wiphy,
u32 bw_flags = 0;
const struct ieee80211_reg_rule *reg_rule = NULL;
const struct ieee80211_power_rule *power_rule = NULL;
- const struct ieee80211_freq_range *freq_range = NULL;
- u32 max_bandwidth_khz;
u32 bw;
for (bw = MHZ_TO_KHZ(20); bw >= MHZ_TO_KHZ(5); bw = bw / 2) {
- reg_rule = freq_reg_info_regd(wiphy,
- MHZ_TO_KHZ(chan->center_freq),
+ reg_rule = freq_reg_info_regd(MHZ_TO_KHZ(chan->center_freq),
regd, bw);
if (!IS_ERR(reg_rule))
break;
}
if (IS_ERR(reg_rule)) {
- REG_DBG_PRINT("Disabling freq %d MHz as custom regd has no rule that fits it\n",
- chan->center_freq);
+ pr_debug("Disabling freq %d MHz as custom regd has no rule that fits it\n",
+ chan->center_freq);
if (wiphy->regulatory_flags & REGULATORY_WIPHY_SELF_MANAGED) {
chan->flags |= IEEE80211_CHAN_DISABLED;
} else {
@@ -1784,34 +1741,8 @@ static void handle_channel_custom(struct wiphy *wiphy,
return;
}
- chan_reg_rule_print_dbg(regd, chan, reg_rule);
-
power_rule = &reg_rule->power_rule;
- freq_range = &reg_rule->freq_range;
-
- max_bandwidth_khz = freq_range->max_bandwidth_khz;
- /* Check if auto calculation requested */
- if (reg_rule->flags & NL80211_RRF_AUTO_BW)
- max_bandwidth_khz = reg_get_max_bandwidth(regd, reg_rule);
-
- /* If we get a reg_rule we can assume that at least 5Mhz fit */
- if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
- MHZ_TO_KHZ(10)))
- bw_flags |= IEEE80211_CHAN_NO_10MHZ;
- if (!reg_does_bw_fit(freq_range, MHZ_TO_KHZ(chan->center_freq),
- MHZ_TO_KHZ(20)))
- bw_flags |= IEEE80211_CHAN_NO_20MHZ;
-
- if (max_bandwidth_khz < MHZ_TO_KHZ(10))
- bw_flags |= IEEE80211_CHAN_NO_10MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(20))
- bw_flags |= IEEE80211_CHAN_NO_20MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(40))
- bw_flags |= IEEE80211_CHAN_NO_HT40;
- if (max_bandwidth_khz < MHZ_TO_KHZ(80))
- bw_flags |= IEEE80211_CHAN_NO_80MHZ;
- if (max_bandwidth_khz < MHZ_TO_KHZ(160))
- bw_flags |= IEEE80211_CHAN_NO_160MHZ;
+ bw_flags = reg_rule_to_chan_bw_flags(regd, reg_rule, chan);
chan->dfs_state_entered = jiffies;
chan->dfs_state = NL80211_DFS_USABLE;
@@ -2540,7 +2471,7 @@ static void restore_alpha2(char *alpha2, bool reset_user)
if (is_user_regdom_saved()) {
/* Unless we're asked to ignore it and reset it */
if (reset_user) {
- REG_DBG_PRINT("Restoring regulatory settings including user preference\n");
+ pr_debug("Restoring regulatory settings including user preference\n");
user_alpha2[0] = '9';
user_alpha2[1] = '7';
@@ -2550,24 +2481,24 @@ static void restore_alpha2(char *alpha2, bool reset_user)
* back as they were for a full restore.
*/
if (!is_world_regdom(ieee80211_regdom)) {
- REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
- ieee80211_regdom[0], ieee80211_regdom[1]);
+ pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
+ ieee80211_regdom[0], ieee80211_regdom[1]);
alpha2[0] = ieee80211_regdom[0];
alpha2[1] = ieee80211_regdom[1];
}
} else {
- REG_DBG_PRINT("Restoring regulatory settings while preserving user preference for: %c%c\n",
- user_alpha2[0], user_alpha2[1]);
+ pr_debug("Restoring regulatory settings while preserving user preference for: %c%c\n",
+ user_alpha2[0], user_alpha2[1]);
alpha2[0] = user_alpha2[0];
alpha2[1] = user_alpha2[1];
}
} else if (!is_world_regdom(ieee80211_regdom)) {
- REG_DBG_PRINT("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
- ieee80211_regdom[0], ieee80211_regdom[1]);
+ pr_debug("Keeping preference on module parameter ieee80211_regdom: %c%c\n",
+ ieee80211_regdom[0], ieee80211_regdom[1]);
alpha2[0] = ieee80211_regdom[0];
alpha2[1] = ieee80211_regdom[1];
} else
- REG_DBG_PRINT("Restoring regulatory settings\n");
+ pr_debug("Restoring regulatory settings\n");
}
static void restore_custom_reg_settings(struct wiphy *wiphy)
@@ -2679,14 +2610,14 @@ static void restore_regulatory_settings(bool reset_user)
list_splice_tail_init(&tmp_reg_req_list, &reg_requests_list);
spin_unlock(&reg_requests_lock);
- REG_DBG_PRINT("Kicking the queue\n");
+ pr_debug("Kicking the queue\n");
schedule_work(&reg_work);
}
void regulatory_hint_disconnect(void)
{
- REG_DBG_PRINT("All devices are disconnected, going to restore regulatory settings\n");
+ pr_debug("All devices are disconnected, going to restore regulatory settings\n");
restore_regulatory_settings(false);
}
@@ -2734,10 +2665,10 @@ int regulatory_hint_found_beacon(struct wiphy *wiphy,
if (!reg_beacon)
return -ENOMEM;
- REG_DBG_PRINT("Found new beacon on frequency: %d MHz (Ch %d) on %s\n",
- beacon_chan->center_freq,
- ieee80211_frequency_to_channel(beacon_chan->center_freq),
- wiphy_name(wiphy));
+ pr_debug("Found new beacon on frequency: %d MHz (Ch %d) on %s\n",
+ beacon_chan->center_freq,
+ ieee80211_frequency_to_channel(beacon_chan->center_freq),
+ wiphy_name(wiphy));
memcpy(&reg_beacon->chan, beacon_chan,
sizeof(struct ieee80211_channel));
@@ -2763,7 +2694,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
const struct ieee80211_power_rule *power_rule = NULL;
char bw[32], cac_time[32];
- pr_info(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n");
+ pr_debug(" (start_freq - end_freq @ bandwidth), (max_antenna_gain, max_eirp), (dfs_cac_time)\n");
for (i = 0; i < rd->n_reg_rules; i++) {
reg_rule = &rd->reg_rules[i];
@@ -2790,7 +2721,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
* in certain regions
*/
if (power_rule->max_antenna_gain)
- pr_info(" (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n",
+ pr_debug(" (%d KHz - %d KHz @ %s), (%d mBi, %d mBm), (%s)\n",
freq_range->start_freq_khz,
freq_range->end_freq_khz,
bw,
@@ -2798,7 +2729,7 @@ static void print_rd_rules(const struct ieee80211_regdomain *rd)
power_rule->max_eirp,
cac_time);
else
- pr_info(" (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n",
+ pr_debug(" (%d KHz - %d KHz @ %s), (N/A, %d mBm), (%s)\n",
freq_range->start_freq_khz,
freq_range->end_freq_khz,
bw,
@@ -2816,8 +2747,7 @@ bool reg_supported_dfs_region(enum nl80211_dfs_regions dfs_region)
case NL80211_DFS_JP:
return true;
default:
- REG_DBG_PRINT("Ignoring uknown DFS master region: %d\n",
- dfs_region);
+ pr_debug("Ignoring uknown DFS master region: %d\n", dfs_region);
return false;
}
}
@@ -2831,35 +2761,35 @@ static void print_regdomain(const struct ieee80211_regdomain *rd)
struct cfg80211_registered_device *rdev;
rdev = cfg80211_rdev_by_wiphy_idx(lr->wiphy_idx);
if (rdev) {
- pr_info("Current regulatory domain updated by AP to: %c%c\n",
+ pr_debug("Current regulatory domain updated by AP to: %c%c\n",
rdev->country_ie_alpha2[0],
rdev->country_ie_alpha2[1]);
} else
- pr_info("Current regulatory domain intersected:\n");
+ pr_debug("Current regulatory domain intersected:\n");
} else
- pr_info("Current regulatory domain intersected:\n");
+ pr_debug("Current regulatory domain intersected:\n");
} else if (is_world_regdom(rd->alpha2)) {
- pr_info("World regulatory domain updated:\n");
+ pr_debug("World regulatory domain updated:\n");
} else {
if (is_unknown_alpha2(rd->alpha2))
- pr_info("Regulatory domain changed to driver built-in settings (unknown country)\n");
+ pr_debug("Regulatory domain changed to driver built-in settings (unknown country)\n");
else {
if (reg_request_cell_base(lr))
- pr_info("Regulatory domain changed to country: %c%c by Cell Station\n",
+ pr_debug("Regulatory domain changed to country: %c%c by Cell Station\n",
rd->alpha2[0], rd->alpha2[1]);
else
- pr_info("Regulatory domain changed to country: %c%c\n",
+ pr_debug("Regulatory domain changed to country: %c%c\n",
rd->alpha2[0], rd->alpha2[1]);
}
}
- pr_info(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region));
+ pr_debug(" DFS Master region: %s", reg_dfs_region_str(rd->dfs_region));
print_rd_rules(rd);
}
static void print_regdomain_info(const struct ieee80211_regdomain *rd)
{
- pr_info("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]);
+ pr_debug("Regulatory domain: %c%c\n", rd->alpha2[0], rd->alpha2[1]);
print_rd_rules(rd);
}
@@ -2880,7 +2810,8 @@ static int reg_set_rd_user(const struct ieee80211_regdomain *rd,
return -EALREADY;
if (!is_valid_rd(rd)) {
- pr_err("Invalid regulatory domain detected:\n");
+ pr_err("Invalid regulatory domain detected: %c%c\n",
+ rd->alpha2[0], rd->alpha2[1]);
print_regdomain_info(rd);
return -EINVAL;
}
@@ -2916,7 +2847,8 @@ static int reg_set_rd_driver(const struct ieee80211_regdomain *rd,
return -EALREADY;
if (!is_valid_rd(rd)) {
- pr_err("Invalid regulatory domain detected:\n");
+ pr_err("Invalid regulatory domain detected: %c%c\n",
+ rd->alpha2[0], rd->alpha2[1]);
print_regdomain_info(rd);
return -EINVAL;
}
@@ -2974,7 +2906,8 @@ static int reg_set_rd_country_ie(const struct ieee80211_regdomain *rd,
*/
if (!is_valid_rd(rd)) {
- pr_err("Invalid regulatory domain detected:\n");
+ pr_err("Invalid regulatory domain detected: %c%c\n",
+ rd->alpha2[0], rd->alpha2[1]);
print_regdomain_info(rd);
return -EINVAL;
}
@@ -3029,6 +2962,7 @@ int set_regdom(const struct ieee80211_regdomain *rd,
break;
default:
WARN(1, "invalid initiator %d\n", lr->initiator);
+ kfree(rd);
return -EINVAL;
}
@@ -3221,8 +3155,10 @@ int __init regulatory_init(void)
/* We always try to get an update for the static regdomain */
err = regulatory_hint_core(cfg80211_world_regdom->alpha2);
if (err) {
- if (err == -ENOMEM)
+ if (err == -ENOMEM) {
+ platform_device_unregister(reg_pdev);
return err;
+ }
/*
* N.B. kobject_uevent_env() can fail mainly for when we're out
* memory which is handled and propagated appropriately above
diff --git a/net/wireless/sme.c b/net/wireless/sme.c
index 8020b5b094d4..544558171787 100644
--- a/net/wireless/sme.c
+++ b/net/wireless/sme.c
@@ -264,7 +264,7 @@ static struct cfg80211_bss *cfg80211_get_conn_bss(struct wireless_dev *wdev)
wdev->conn->params.bssid,
wdev->conn->params.ssid,
wdev->conn->params.ssid_len,
- IEEE80211_BSS_TYPE_ESS,
+ wdev->conn_bss_type,
IEEE80211_PRIVACY(wdev->conn->params.privacy));
if (!bss)
return NULL;
@@ -687,7 +687,7 @@ void __cfg80211_connect_result(struct net_device *dev, const u8 *bssid,
WARN_ON_ONCE(!wiphy_to_rdev(wdev->wiphy)->ops->connect);
bss = cfg80211_get_bss(wdev->wiphy, NULL, bssid,
wdev->ssid, wdev->ssid_len,
- IEEE80211_BSS_TYPE_ESS,
+ wdev->conn_bss_type,
IEEE80211_PRIVACY_ANY);
if (bss)
cfg80211_hold_bss(bss_from_pub(bss));
@@ -846,7 +846,7 @@ void cfg80211_roamed(struct net_device *dev,
bss = cfg80211_get_bss(wdev->wiphy, channel, bssid, wdev->ssid,
wdev->ssid_len,
- IEEE80211_BSS_TYPE_ESS, IEEE80211_PRIVACY_ANY);
+ wdev->conn_bss_type, IEEE80211_PRIVACY_ANY);
if (WARN_ON(!bss))
return;
@@ -917,6 +917,12 @@ void __cfg80211_disconnected(struct net_device *dev, const u8 *ie,
nl80211_send_disconnected(rdev, dev, reason, ie, ie_len, from_ap);
+ /* stop critical protocol if supported */
+ if (rdev->ops->crit_proto_stop && rdev->crit_proto_nlportid) {
+ rdev->crit_proto_nlportid = 0;
+ rdev_crit_proto_stop(rdev, wdev);
+ }
+
/*
* Delete all the keys ... pairwise keys can't really
* exist any more anyway, but default keys might.
@@ -1017,6 +1023,9 @@ int cfg80211_connect(struct cfg80211_registered_device *rdev,
memcpy(wdev->ssid, connect->ssid, connect->ssid_len);
wdev->ssid_len = connect->ssid_len;
+ wdev->conn_bss_type = connect->pbss ? IEEE80211_BSS_TYPE_PBSS :
+ IEEE80211_BSS_TYPE_ESS;
+
if (!rdev->ops->connect)
err = cfg80211_sme_connect(wdev, connect, prev_bssid);
else
diff --git a/net/wireless/trace.h b/net/wireless/trace.h
index 0c392d36781b..09b242b09bed 100644
--- a/net/wireless/trace.h
+++ b/net/wireless/trace.h
@@ -623,12 +623,24 @@ DECLARE_EVENT_CLASS(station_add_change,
__field(u32, sta_flags_set)
__field(u32, sta_modify_mask)
__field(int, listen_interval)
+ __field(u16, capability)
__field(u16, aid)
__field(u8, plink_action)
__field(u8, plink_state)
__field(u8, uapsd_queues)
+ __field(u8, max_sp)
+ __field(u8, opmode_notif)
+ __field(bool, opmode_notif_used)
__array(u8, ht_capa, (int)sizeof(struct ieee80211_ht_cap))
+ __array(u8, vht_capa, (int)sizeof(struct ieee80211_vht_cap))
__array(char, vlan, IFNAMSIZ)
+ __dynamic_array(u8, supported_rates,
+ params->supported_rates_len)
+ __dynamic_array(u8, ext_capab, params->ext_capab_len)
+ __dynamic_array(u8, supported_channels,
+ params->supported_channels_len)
+ __dynamic_array(u8, supported_oper_classes,
+ params->supported_oper_classes_len)
),
TP_fast_assign(
WIPHY_ASSIGN;
@@ -646,9 +658,35 @@ DECLARE_EVENT_CLASS(station_add_change,
if (params->ht_capa)
memcpy(__entry->ht_capa, params->ht_capa,
sizeof(struct ieee80211_ht_cap));
+ memset(__entry->vht_capa, 0, sizeof(struct ieee80211_vht_cap));
+ if (params->vht_capa)
+ memcpy(__entry->vht_capa, params->vht_capa,
+ sizeof(struct ieee80211_vht_cap));
memset(__entry->vlan, 0, sizeof(__entry->vlan));
if (params->vlan)
memcpy(__entry->vlan, params->vlan->name, IFNAMSIZ);
+ if (params->supported_rates && params->supported_rates_len)
+ memcpy(__get_dynamic_array(supported_rates),
+ params->supported_rates,
+ params->supported_rates_len);
+ if (params->ext_capab && params->ext_capab_len)
+ memcpy(__get_dynamic_array(ext_capab),
+ params->ext_capab,
+ params->ext_capab_len);
+ if (params->supported_channels &&
+ params->supported_channels_len)
+ memcpy(__get_dynamic_array(supported_channels),
+ params->supported_channels,
+ params->supported_channels_len);
+ if (params->supported_oper_classes &&
+ params->supported_oper_classes_len)
+ memcpy(__get_dynamic_array(supported_oper_classes),
+ params->supported_oper_classes,
+ params->supported_oper_classes_len);
+ __entry->max_sp = params->max_sp;
+ __entry->capability = params->capability;
+ __entry->opmode_notif = params->opmode_notif;
+ __entry->opmode_notif_used = params->opmode_notif_used;
),
TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", station mac: " MAC_PR_FMT
", station flags mask: %u, station flags set: %u, "
@@ -2818,6 +2856,71 @@ TRACE_EVENT(cfg80211_stop_iface,
WIPHY_PR_ARG, WDEV_PR_ARG)
);
+TRACE_EVENT(rdev_start_radar_detection,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ struct cfg80211_chan_def *chandef,
+ u32 cac_time_ms),
+ TP_ARGS(wiphy, netdev, chandef, cac_time_ms),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ CHAN_DEF_ENTRY
+ __field(u32, cac_time_ms)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ CHAN_DEF_ASSIGN(chandef);
+ __entry->cac_time_ms = cac_time_ms;
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", " CHAN_DEF_PR_FMT
+ ", cac_time_ms=%u",
+ WIPHY_PR_ARG, NETDEV_PR_ARG, CHAN_DEF_PR_ARG,
+ __entry->cac_time_ms)
+);
+
+TRACE_EVENT(rdev_set_mcast_rate,
+ TP_PROTO(struct wiphy *wiphy, struct net_device *netdev,
+ int mcast_rate[IEEE80211_NUM_BANDS]),
+ TP_ARGS(wiphy, netdev, mcast_rate),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ NETDEV_ENTRY
+ __array(int, mcast_rate, IEEE80211_NUM_BANDS)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ NETDEV_ASSIGN;
+ memcpy(__entry->mcast_rate, mcast_rate,
+ sizeof(int) * IEEE80211_NUM_BANDS);
+ ),
+ TP_printk(WIPHY_PR_FMT ", " NETDEV_PR_FMT ", "
+ "mcast_rates [2.4GHz=0x%x, 5.2GHz=0x%x, 60GHz=0x%x]",
+ WIPHY_PR_ARG, NETDEV_PR_ARG,
+ __entry->mcast_rate[IEEE80211_BAND_2GHZ],
+ __entry->mcast_rate[IEEE80211_BAND_5GHZ],
+ __entry->mcast_rate[IEEE80211_BAND_60GHZ])
+);
+
+TRACE_EVENT(rdev_set_coalesce,
+ TP_PROTO(struct wiphy *wiphy, struct cfg80211_coalesce *coalesce),
+ TP_ARGS(wiphy, coalesce),
+ TP_STRUCT__entry(
+ WIPHY_ENTRY
+ __field(int, n_rules)
+ ),
+ TP_fast_assign(
+ WIPHY_ASSIGN;
+ __entry->n_rules = coalesce ? coalesce->n_rules : 0;
+ ),
+ TP_printk(WIPHY_PR_FMT ", n_rules=%d",
+ WIPHY_PR_ARG, __entry->n_rules)
+);
+
+DEFINE_EVENT(wiphy_wdev_evt, rdev_abort_scan,
+ TP_PROTO(struct wiphy *wiphy, struct wireless_dev *wdev),
+ TP_ARGS(wiphy, wdev)
+);
#endif /* !__RDEV_OPS_TRACE || TRACE_HEADER_MULTI_READ */
#undef TRACE_INCLUDE_PATH
diff --git a/net/wireless/util.c b/net/wireless/util.c
index baf7218cec15..9f440a9de63b 100644
--- a/net/wireless/util.c
+++ b/net/wireless/util.c
@@ -393,9 +393,9 @@ unsigned int ieee80211_get_hdrlen_from_skb(const struct sk_buff *skb)
}
EXPORT_SYMBOL(ieee80211_get_hdrlen_from_skb);
-unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
+static unsigned int __ieee80211_get_mesh_hdrlen(u8 flags)
{
- int ae = meshhdr->flags & MESH_FLAGS_AE;
+ int ae = flags & MESH_FLAGS_AE;
/* 802.11-2012, 8.2.4.7.3 */
switch (ae) {
default:
@@ -407,21 +407,31 @@ unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
return 18;
}
}
+
+unsigned int ieee80211_get_mesh_hdrlen(struct ieee80211s_hdr *meshhdr)
+{
+ return __ieee80211_get_mesh_hdrlen(meshhdr->flags);
+}
EXPORT_SYMBOL(ieee80211_get_mesh_hdrlen);
-int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
- enum nl80211_iftype iftype)
+static int __ieee80211_data_to_8023(struct sk_buff *skb, struct ethhdr *ehdr,
+ const u8 *addr, enum nl80211_iftype iftype)
{
struct ieee80211_hdr *hdr = (struct ieee80211_hdr *) skb->data;
- u16 hdrlen, ethertype;
- u8 *payload;
- u8 dst[ETH_ALEN];
- u8 src[ETH_ALEN] __aligned(2);
+ struct {
+ u8 hdr[ETH_ALEN] __aligned(2);
+ __be16 proto;
+ } payload;
+ struct ethhdr tmp;
+ u16 hdrlen;
+ u8 mesh_flags = 0;
if (unlikely(!ieee80211_is_data_present(hdr->frame_control)))
return -1;
hdrlen = ieee80211_hdrlen(hdr->frame_control);
+ if (skb->len < hdrlen + 8)
+ return -1;
/* convert IEEE 802.11 header + possible LLC headers into Ethernet
* header
@@ -432,8 +442,11 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
* 1 0 BSSID SA DA n/a
* 1 1 RA TA DA SA
*/
- memcpy(dst, ieee80211_get_DA(hdr), ETH_ALEN);
- memcpy(src, ieee80211_get_SA(hdr), ETH_ALEN);
+ memcpy(tmp.h_dest, ieee80211_get_DA(hdr), ETH_ALEN);
+ memcpy(tmp.h_source, ieee80211_get_SA(hdr), ETH_ALEN);
+
+ if (iftype == NL80211_IFTYPE_MESH_POINT)
+ skb_copy_bits(skb, hdrlen, &mesh_flags, 1);
switch (hdr->frame_control &
cpu_to_le16(IEEE80211_FCTL_TODS | IEEE80211_FCTL_FROMDS)) {
@@ -450,44 +463,31 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
iftype != NL80211_IFTYPE_STATION))
return -1;
if (iftype == NL80211_IFTYPE_MESH_POINT) {
- struct ieee80211s_hdr *meshdr =
- (struct ieee80211s_hdr *) (skb->data + hdrlen);
- /* make sure meshdr->flags is on the linear part */
- if (!pskb_may_pull(skb, hdrlen + 1))
- return -1;
- if (meshdr->flags & MESH_FLAGS_AE_A4)
+ if (mesh_flags & MESH_FLAGS_AE_A4)
return -1;
- if (meshdr->flags & MESH_FLAGS_AE_A5_A6) {
+ if (mesh_flags & MESH_FLAGS_AE_A5_A6) {
skb_copy_bits(skb, hdrlen +
offsetof(struct ieee80211s_hdr, eaddr1),
- dst, ETH_ALEN);
- skb_copy_bits(skb, hdrlen +
- offsetof(struct ieee80211s_hdr, eaddr2),
- src, ETH_ALEN);
+ tmp.h_dest, 2 * ETH_ALEN);
}
- hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+ hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags);
}
break;
case cpu_to_le16(IEEE80211_FCTL_FROMDS):
if ((iftype != NL80211_IFTYPE_STATION &&
iftype != NL80211_IFTYPE_P2P_CLIENT &&
iftype != NL80211_IFTYPE_MESH_POINT) ||
- (is_multicast_ether_addr(dst) &&
- ether_addr_equal(src, addr)))
+ (is_multicast_ether_addr(tmp.h_dest) &&
+ ether_addr_equal(tmp.h_source, addr)))
return -1;
if (iftype == NL80211_IFTYPE_MESH_POINT) {
- struct ieee80211s_hdr *meshdr =
- (struct ieee80211s_hdr *) (skb->data + hdrlen);
- /* make sure meshdr->flags is on the linear part */
- if (!pskb_may_pull(skb, hdrlen + 1))
- return -1;
- if (meshdr->flags & MESH_FLAGS_AE_A5_A6)
+ if (mesh_flags & MESH_FLAGS_AE_A5_A6)
return -1;
- if (meshdr->flags & MESH_FLAGS_AE_A4)
+ if (mesh_flags & MESH_FLAGS_AE_A4)
skb_copy_bits(skb, hdrlen +
offsetof(struct ieee80211s_hdr, eaddr1),
- src, ETH_ALEN);
- hdrlen += ieee80211_get_mesh_hdrlen(meshdr);
+ tmp.h_source, ETH_ALEN);
+ hdrlen += __ieee80211_get_mesh_hdrlen(mesh_flags);
}
break;
case cpu_to_le16(0):
@@ -498,33 +498,33 @@ int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
break;
}
- if (!pskb_may_pull(skb, hdrlen + 8))
- return -1;
-
- payload = skb->data + hdrlen;
- ethertype = (payload[6] << 8) | payload[7];
+ skb_copy_bits(skb, hdrlen, &payload, sizeof(payload));
+ tmp.h_proto = payload.proto;
- if (likely((ether_addr_equal(payload, rfc1042_header) &&
- ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
- ether_addr_equal(payload, bridge_tunnel_header))) {
+ if (likely((ether_addr_equal(payload.hdr, rfc1042_header) &&
+ tmp.h_proto != htons(ETH_P_AARP) &&
+ tmp.h_proto != htons(ETH_P_IPX)) ||
+ ether_addr_equal(payload.hdr, bridge_tunnel_header)))
/* remove RFC1042 or Bridge-Tunnel encapsulation and
* replace EtherType */
- skb_pull(skb, hdrlen + 6);
- memcpy(skb_push(skb, ETH_ALEN), src, ETH_ALEN);
- memcpy(skb_push(skb, ETH_ALEN), dst, ETH_ALEN);
- } else {
- struct ethhdr *ehdr;
- __be16 len;
+ hdrlen += ETH_ALEN + 2;
+ else
+ tmp.h_proto = htons(skb->len);
+
+ pskb_pull(skb, hdrlen);
- skb_pull(skb, hdrlen);
- len = htons(skb->len);
+ if (!ehdr)
ehdr = (struct ethhdr *) skb_push(skb, sizeof(struct ethhdr));
- memcpy(ehdr->h_dest, dst, ETH_ALEN);
- memcpy(ehdr->h_source, src, ETH_ALEN);
- ehdr->h_proto = len;
- }
+ memcpy(ehdr, &tmp, sizeof(tmp));
+
return 0;
}
+
+int ieee80211_data_to_8023(struct sk_buff *skb, const u8 *addr,
+ enum nl80211_iftype iftype)
+{
+ return __ieee80211_data_to_8023(skb, NULL, addr, iftype);
+}
EXPORT_SYMBOL(ieee80211_data_to_8023);
int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
@@ -636,7 +636,7 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
/* Update skb pointers to various headers since this modified frame
* is going to go through Linux networking code that may potentially
* need things like pointer to IP header. */
- skb_set_mac_header(skb, 0);
+ skb_reset_mac_header(skb);
skb_set_network_header(skb, nh_pos);
skb_set_transport_header(skb, h_pos);
@@ -644,70 +644,147 @@ int ieee80211_data_from_8023(struct sk_buff *skb, const u8 *addr,
}
EXPORT_SYMBOL(ieee80211_data_from_8023);
+static void
+__frame_add_frag(struct sk_buff *skb, struct page *page,
+ void *ptr, int len, int size)
+{
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ int page_offset;
+
+ atomic_inc(&page->_count);
+ page_offset = ptr - page_address(page);
+ skb_add_rx_frag(skb, sh->nr_frags, page, page_offset, len, size);
+}
+
+static void
+__ieee80211_amsdu_copy_frag(struct sk_buff *skb, struct sk_buff *frame,
+ int offset, int len)
+{
+ struct skb_shared_info *sh = skb_shinfo(skb);
+ const skb_frag_t *frag = &sh->frags[-1];
+ struct page *frag_page;
+ void *frag_ptr;
+ int frag_len, frag_size;
+ int head_size = skb->len - skb->data_len;
+ int cur_len;
+
+ frag_page = virt_to_head_page(skb->head);
+ frag_ptr = skb->data;
+ frag_size = head_size;
+
+ while (offset >= frag_size) {
+ offset -= frag_size;
+ frag++;
+ frag_page = skb_frag_page(frag);
+ frag_ptr = skb_frag_address(frag);
+ frag_size = skb_frag_size(frag);
+ }
+
+ frag_ptr += offset;
+ frag_len = frag_size - offset;
+
+ cur_len = min(len, frag_len);
+
+ __frame_add_frag(frame, frag_page, frag_ptr, cur_len, frag_size);
+ len -= cur_len;
+
+ while (len > 0) {
+ frag++;
+ frag_len = skb_frag_size(frag);
+ cur_len = min(len, frag_len);
+ __frame_add_frag(frame, skb_frag_page(frag),
+ skb_frag_address(frag), cur_len, frag_len);
+ len -= cur_len;
+ }
+}
+
+static struct sk_buff *
+__ieee80211_amsdu_copy(struct sk_buff *skb, unsigned int hlen,
+ int offset, int len, bool reuse_frag)
+{
+ struct sk_buff *frame;
+ int cur_len = len;
+
+ if (skb->len - offset < len)
+ return NULL;
+
+ /*
+ * When reusing framents, copy some data to the head to simplify
+ * ethernet header handling and speed up protocol header processing
+ * in the stack later.
+ */
+ if (reuse_frag)
+ cur_len = min_t(int, len, 32);
+
+ /*
+ * Allocate and reserve two bytes more for payload
+ * alignment since sizeof(struct ethhdr) is 14.
+ */
+ frame = dev_alloc_skb(hlen + sizeof(struct ethhdr) + 2 + cur_len);
+
+ skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2);
+ skb_copy_bits(skb, offset, skb_put(frame, cur_len), cur_len);
+
+ len -= cur_len;
+ if (!len)
+ return frame;
+
+ offset += cur_len;
+ __ieee80211_amsdu_copy_frag(skb, frame, offset, len);
+
+ return frame;
+}
void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
const u8 *addr, enum nl80211_iftype iftype,
const unsigned int extra_headroom,
bool has_80211_header)
{
+ unsigned int hlen = ALIGN(extra_headroom, 4);
struct sk_buff *frame = NULL;
u16 ethertype;
u8 *payload;
- const struct ethhdr *eth;
- int remaining, err;
- u8 dst[ETH_ALEN], src[ETH_ALEN];
+ int offset = 0, remaining, err;
+ struct ethhdr eth;
+ bool reuse_frag = skb->head_frag && !skb_has_frag_list(skb);
+ bool reuse_skb = false;
+ bool last = false;
if (has_80211_header) {
- err = ieee80211_data_to_8023(skb, addr, iftype);
+ err = __ieee80211_data_to_8023(skb, &eth, addr, iftype);
if (err)
goto out;
-
- /* skip the wrapping header */
- eth = (struct ethhdr *) skb_pull(skb, sizeof(struct ethhdr));
- if (!eth)
- goto out;
- } else {
- eth = (struct ethhdr *) skb->data;
}
- while (skb != frame) {
+ while (!last) {
+ unsigned int subframe_len;
+ int len;
u8 padding;
- __be16 len = eth->h_proto;
- unsigned int subframe_len = sizeof(struct ethhdr) + ntohs(len);
-
- remaining = skb->len;
- memcpy(dst, eth->h_dest, ETH_ALEN);
- memcpy(src, eth->h_source, ETH_ALEN);
+ skb_copy_bits(skb, offset, &eth, sizeof(eth));
+ len = ntohs(eth.h_proto);
+ subframe_len = sizeof(struct ethhdr) + len;
padding = (4 - subframe_len) & 0x3;
+
/* the last MSDU has no padding */
+ remaining = skb->len - offset;
if (subframe_len > remaining)
goto purge;
- skb_pull(skb, sizeof(struct ethhdr));
+ offset += sizeof(struct ethhdr);
/* reuse skb for the last subframe */
- if (remaining <= subframe_len + padding)
+ last = remaining <= subframe_len + padding;
+ if (!skb_is_nonlinear(skb) && !reuse_frag && last) {
+ skb_pull(skb, offset);
frame = skb;
- else {
- unsigned int hlen = ALIGN(extra_headroom, 4);
- /*
- * Allocate and reserve two bytes more for payload
- * alignment since sizeof(struct ethhdr) is 14.
- */
- frame = dev_alloc_skb(hlen + subframe_len + 2);
+ reuse_skb = true;
+ } else {
+ frame = __ieee80211_amsdu_copy(skb, hlen, offset, len,
+ reuse_frag);
if (!frame)
goto purge;
- skb_reserve(frame, hlen + sizeof(struct ethhdr) + 2);
- memcpy(skb_put(frame, ntohs(len)), skb->data,
- ntohs(len));
-
- eth = (struct ethhdr *)skb_pull(skb, ntohs(len) +
- padding);
- if (!eth) {
- dev_kfree_skb(frame);
- goto purge;
- }
+ offset += len + padding;
}
skb_reset_network_header(frame);
@@ -716,24 +793,20 @@ void ieee80211_amsdu_to_8023s(struct sk_buff *skb, struct sk_buff_head *list,
payload = frame->data;
ethertype = (payload[6] << 8) | payload[7];
-
if (likely((ether_addr_equal(payload, rfc1042_header) &&
ethertype != ETH_P_AARP && ethertype != ETH_P_IPX) ||
ether_addr_equal(payload, bridge_tunnel_header))) {
- /* remove RFC1042 or Bridge-Tunnel
- * encapsulation and replace EtherType */
- skb_pull(frame, 6);
- memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN);
- memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
- } else {
- memcpy(skb_push(frame, sizeof(__be16)), &len,
- sizeof(__be16));
- memcpy(skb_push(frame, ETH_ALEN), src, ETH_ALEN);
- memcpy(skb_push(frame, ETH_ALEN), dst, ETH_ALEN);
+ eth.h_proto = htons(ethertype);
+ skb_pull(frame, ETH_ALEN + 2);
}
+
+ memcpy(skb_push(frame, sizeof(eth)), &eth, sizeof(eth));
__skb_queue_tail(list, frame);
}
+ if (!reuse_skb)
+ dev_kfree_skb(skb);
+
return;
purge:
@@ -1325,13 +1398,6 @@ size_t ieee80211_ie_split_ric(const u8 *ies, size_t ielen,
}
EXPORT_SYMBOL(ieee80211_ie_split_ric);
-size_t ieee80211_ie_split(const u8 *ies, size_t ielen,
- const u8 *ids, int n_ids, size_t offset)
-{
- return ieee80211_ie_split_ric(ies, ielen, ids, n_ids, NULL, 0, offset);
-}
-EXPORT_SYMBOL(ieee80211_ie_split);
-
bool ieee80211_operating_class_to_band(u8 operating_class,
enum ieee80211_band *band)
{
@@ -1620,120 +1686,6 @@ int cfg80211_check_combinations(struct wiphy *wiphy,
}
EXPORT_SYMBOL(cfg80211_check_combinations);
-int cfg80211_can_use_iftype_chan(struct cfg80211_registered_device *rdev,
- struct wireless_dev *wdev,
- enum nl80211_iftype iftype,
- struct ieee80211_channel *chan,
- enum cfg80211_chan_mode chanmode,
- u8 radar_detect)
-{
- struct wireless_dev *wdev_iter;
- int num[NUM_NL80211_IFTYPES];
- struct ieee80211_channel
- *used_channels[CFG80211_MAX_NUM_DIFFERENT_CHANNELS];
- struct ieee80211_channel *ch;
- enum cfg80211_chan_mode chmode;
- int num_different_channels = 0;
- int total = 1;
- int i;
-
- ASSERT_RTNL();
-
- if (WARN_ON(hweight32(radar_detect) > 1))
- return -EINVAL;
-
- if (WARN_ON(iftype >= NUM_NL80211_IFTYPES))
- return -EINVAL;
-
- /* Always allow software iftypes */
- if (rdev->wiphy.software_iftypes & BIT(iftype)) {
- if (radar_detect)
- return -EINVAL;
- return 0;
- }
-
- memset(num, 0, sizeof(num));
- memset(used_channels, 0, sizeof(used_channels));
-
- num[iftype] = 1;
-
- /* TODO: We'll probably not need this anymore, since this
- * should only be called with CHAN_MODE_UNDEFINED. There are
- * still a couple of pending calls where other chanmodes are
- * used, but we should get rid of them.
- */
- switch (chanmode) {
- case CHAN_MODE_UNDEFINED:
- break;
- case CHAN_MODE_SHARED:
- WARN_ON(!chan);
- used_channels[0] = chan;
- num_different_channels++;
- break;
- case CHAN_MODE_EXCLUSIVE:
- num_different_channels++;
- break;
- }
-
- list_for_each_entry(wdev_iter, &rdev->wdev_list, list) {
- if (wdev_iter == wdev)
- continue;
- if (wdev_iter->iftype == NL80211_IFTYPE_P2P_DEVICE) {
- if (!wdev_iter->p2p_started)
- continue;
- } else if (wdev_iter->netdev) {
- if (!netif_running(wdev_iter->netdev))
- continue;
- } else {
- WARN_ON(1);
- }
-
- if (rdev->wiphy.software_iftypes & BIT(wdev_iter->iftype))
- continue;
-
- /*
- * We may be holding the "wdev" mutex, but now need to lock
- * wdev_iter. This is OK because once we get here wdev_iter
- * is not wdev (tested above), but we need to use the nested
- * locking for lockdep.
- */
- mutex_lock_nested(&wdev_iter->mtx, 1);
- __acquire(wdev_iter->mtx);
- cfg80211_get_chan_state(wdev_iter, &ch, &chmode, &radar_detect);
- wdev_unlock(wdev_iter);
-
- switch (chmode) {
- case CHAN_MODE_UNDEFINED:
- break;
- case CHAN_MODE_SHARED:
- for (i = 0; i < CFG80211_MAX_NUM_DIFFERENT_CHANNELS; i++)
- if (!used_channels[i] || used_channels[i] == ch)
- break;
-
- if (i == CFG80211_MAX_NUM_DIFFERENT_CHANNELS)
- return -EBUSY;
-
- if (used_channels[i] == NULL) {
- used_channels[i] = ch;
- num_different_channels++;
- }
- break;
- case CHAN_MODE_EXCLUSIVE:
- num_different_channels++;
- break;
- }
-
- num[wdev_iter->iftype]++;
- total++;
- }
-
- if (total == 1 && !radar_detect)
- return 0;
-
- return cfg80211_check_combinations(&rdev->wiphy, num_different_channels,
- radar_detect, num);
-}
-
int ieee80211_get_ratemask(struct ieee80211_supported_band *sband,
const u8 *rates, unsigned int n_rates,
u32 *mask)
diff --git a/net/wireless/wext-core.c b/net/wireless/wext-core.c
index c8717c1d082e..b50ee5d622e1 100644
--- a/net/wireless/wext-core.c
+++ b/net/wireless/wext-core.c
@@ -342,6 +342,40 @@ static const int compat_event_type_size[] = {
/* IW event code */
+void wireless_nlevent_flush(void)
+{
+ struct sk_buff *skb;
+ struct net *net;
+
+ ASSERT_RTNL();
+
+ for_each_net(net) {
+ while ((skb = skb_dequeue(&net->wext_nlevents)))
+ rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
+ GFP_KERNEL);
+ }
+}
+EXPORT_SYMBOL_GPL(wireless_nlevent_flush);
+
+static int wext_netdev_notifier_call(struct notifier_block *nb,
+ unsigned long state, void *ptr)
+{
+ /*
+ * When a netdev changes state in any way, flush all pending messages
+ * to avoid them going out in a strange order, e.g. RTM_NEWLINK after
+ * RTM_DELLINK, or with IFF_UP after without IFF_UP during dev_close()
+ * or similar - all of which could otherwise happen due to delays from
+ * schedule_work().
+ */
+ wireless_nlevent_flush();
+
+ return NOTIFY_OK;
+}
+
+static struct notifier_block wext_netdev_notifier = {
+ .notifier_call = wext_netdev_notifier_call,
+};
+
static int __net_init wext_pernet_init(struct net *net)
{
skb_queue_head_init(&net->wext_nlevents);
@@ -360,7 +394,12 @@ static struct pernet_operations wext_pernet_ops = {
static int __init wireless_nlevent_init(void)
{
- return register_pernet_subsys(&wext_pernet_ops);
+ int err = register_pernet_subsys(&wext_pernet_ops);
+
+ if (err)
+ return err;
+
+ return register_netdevice_notifier(&wext_netdev_notifier);
}
subsys_initcall(wireless_nlevent_init);
@@ -368,17 +407,8 @@ subsys_initcall(wireless_nlevent_init);
/* Process events generated by the wireless layer or the driver. */
static void wireless_nlevent_process(struct work_struct *work)
{
- struct sk_buff *skb;
- struct net *net;
-
rtnl_lock();
-
- for_each_net(net) {
- while ((skb = skb_dequeue(&net->wext_nlevents)))
- rtnl_notify(skb, net, 0, RTNLGRP_LINK, NULL,
- GFP_KERNEL);
- }
-
+ wireless_nlevent_flush();
rtnl_unlock();
}
diff --git a/net/xfrm/xfrm_algo.c b/net/xfrm/xfrm_algo.c
index f07224d8b88f..250e567ba3d6 100644
--- a/net/xfrm/xfrm_algo.c
+++ b/net/xfrm/xfrm_algo.c
@@ -9,6 +9,8 @@
* any later version.
*/
+#include <crypto/hash.h>
+#include <crypto/skcipher.h>
#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/pfkeyv2.h>
@@ -782,14 +784,13 @@ void xfrm_probe_algs(void)
BUG_ON(in_softirq());
for (i = 0; i < aalg_entries(); i++) {
- status = crypto_has_hash(aalg_list[i].name, 0,
- CRYPTO_ALG_ASYNC);
+ status = crypto_has_ahash(aalg_list[i].name, 0, 0);
if (aalg_list[i].available != status)
aalg_list[i].available = status;
}
for (i = 0; i < ealg_entries(); i++) {
- status = crypto_has_ablkcipher(ealg_list[i].name, 0, 0);
+ status = crypto_has_skcipher(ealg_list[i].name, 0, 0);
if (ealg_list[i].available != status)
ealg_list[i].available = status;
}
diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index ad7f5b3f9b61..1c4ad477ce93 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -292,12 +292,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
skb_dst_force(skb);
+ dev_hold(skb->dev);
nexthdr = x->type->input(x, skb);
if (nexthdr == -EINPROGRESS)
return 0;
resume:
+ dev_put(skb->dev);
+
spin_lock(&x->lock);
if (nexthdr <= 0) {
if (nexthdr == -EBADMSG) {
diff --git a/net/xfrm/xfrm_output.c b/net/xfrm/xfrm_output.c
index cc3676eb6239..ff4a91fcab9f 100644
--- a/net/xfrm/xfrm_output.c
+++ b/net/xfrm/xfrm_output.c
@@ -167,6 +167,8 @@ static int xfrm_output_gso(struct net *net, struct sock *sk, struct sk_buff *skb
{
struct sk_buff *segs;
+ BUILD_BUG_ON(sizeof(*IPCB(skb)) > SKB_SGO_CB_OFFSET);
+ BUILD_BUG_ON(sizeof(*IP6CB(skb)) > SKB_SGO_CB_OFFSET);
segs = skb_gso_segment(skb, 0);
kfree_skb(skb);
if (IS_ERR(segs))
diff --git a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c
index 09bfcbac63bb..b5e665b3cfb0 100644
--- a/net/xfrm/xfrm_policy.c
+++ b/net/xfrm/xfrm_policy.c
@@ -303,6 +303,14 @@ struct xfrm_policy *xfrm_policy_alloc(struct net *net, gfp_t gfp)
}
EXPORT_SYMBOL(xfrm_policy_alloc);
+static void xfrm_policy_destroy_rcu(struct rcu_head *head)
+{
+ struct xfrm_policy *policy = container_of(head, struct xfrm_policy, rcu);
+
+ security_xfrm_policy_free(policy->security);
+ kfree(policy);
+}
+
/* Destroy xfrm_policy: descendant resources must be released to this moment. */
void xfrm_policy_destroy(struct xfrm_policy *policy)
@@ -312,8 +320,7 @@ void xfrm_policy_destroy(struct xfrm_policy *policy)
if (del_timer(&policy->timer) || del_timer(&policy->polq.hold_timer))
BUG();
- security_xfrm_policy_free(policy->security);
- kfree(policy);
+ call_rcu(&policy->rcu, xfrm_policy_destroy_rcu);
}
EXPORT_SYMBOL(xfrm_policy_destroy);
@@ -1214,8 +1221,10 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
struct xfrm_policy *pol;
struct net *net = sock_net(sk);
+ rcu_read_lock();
read_lock_bh(&net->xfrm.xfrm_policy_lock);
- if ((pol = sk->sk_policy[dir]) != NULL) {
+ pol = rcu_dereference(sk->sk_policy[dir]);
+ if (pol != NULL) {
bool match = xfrm_selector_match(&pol->selector, fl,
sk->sk_family);
int err = 0;
@@ -1239,6 +1248,7 @@ static struct xfrm_policy *xfrm_sk_policy_lookup(const struct sock *sk, int dir,
}
out:
read_unlock_bh(&net->xfrm.xfrm_policy_lock);
+ rcu_read_unlock();
return pol;
}
@@ -1307,13 +1317,14 @@ int xfrm_sk_policy_insert(struct sock *sk, int dir, struct xfrm_policy *pol)
#endif
write_lock_bh(&net->xfrm.xfrm_policy_lock);
- old_pol = sk->sk_policy[dir];
- sk->sk_policy[dir] = pol;
+ old_pol = rcu_dereference_protected(sk->sk_policy[dir],
+ lockdep_is_held(&net->xfrm.xfrm_policy_lock));
if (pol) {
pol->curlft.add_time = get_seconds();
pol->index = xfrm_gen_index(net, XFRM_POLICY_MAX+dir, 0);
xfrm_sk_policy_link(pol, dir);
}
+ rcu_assign_pointer(sk->sk_policy[dir], pol);
if (old_pol) {
if (pol)
xfrm_policy_requeue(old_pol, pol);
@@ -1361,17 +1372,26 @@ static struct xfrm_policy *clone_policy(const struct xfrm_policy *old, int dir)
return newp;
}
-int __xfrm_sk_clone_policy(struct sock *sk)
+int __xfrm_sk_clone_policy(struct sock *sk, const struct sock *osk)
{
- struct xfrm_policy *p0 = sk->sk_policy[0],
- *p1 = sk->sk_policy[1];
+ const struct xfrm_policy *p;
+ struct xfrm_policy *np;
+ int i, ret = 0;
- sk->sk_policy[0] = sk->sk_policy[1] = NULL;
- if (p0 && (sk->sk_policy[0] = clone_policy(p0, 0)) == NULL)
- return -ENOMEM;
- if (p1 && (sk->sk_policy[1] = clone_policy(p1, 1)) == NULL)
- return -ENOMEM;
- return 0;
+ rcu_read_lock();
+ for (i = 0; i < 2; i++) {
+ p = rcu_dereference(osk->sk_policy[i]);
+ if (p) {
+ np = clone_policy(p, i);
+ if (unlikely(!np)) {
+ ret = -ENOMEM;
+ break;
+ }
+ rcu_assign_pointer(sk->sk_policy[i], np);
+ }
+ }
+ rcu_read_unlock();
+ return ret;
}
static int
@@ -2198,6 +2218,7 @@ struct dst_entry *xfrm_lookup(struct net *net, struct dst_entry *dst_orig,
xdst = NULL;
route = NULL;
+ sk = sk_const_to_full_sk(sk);
if (sk && sk->sk_policy[XFRM_POLICY_OUT]) {
num_pols = 1;
pols[0] = xfrm_sk_policy_lookup(sk, XFRM_POLICY_OUT, fl);
@@ -2477,6 +2498,7 @@ int __xfrm_policy_check(struct sock *sk, int dir, struct sk_buff *skb,
}
pol = NULL;
+ sk = sk_to_full_sk(sk);
if (sk && sk->sk_policy[dir]) {
pol = xfrm_sk_policy_lookup(sk, dir, &fl);
if (IS_ERR(pol)) {
@@ -2804,7 +2826,6 @@ static struct neighbour *xfrm_neigh_lookup(const struct dst_entry *dst,
int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
{
- struct net *net;
int err = 0;
if (unlikely(afinfo == NULL))
return -EINVAL;
@@ -2835,26 +2856,6 @@ int xfrm_policy_register_afinfo(struct xfrm_policy_afinfo *afinfo)
}
spin_unlock(&xfrm_policy_afinfo_lock);
- rtnl_lock();
- for_each_net(net) {
- struct dst_ops *xfrm_dst_ops;
-
- switch (afinfo->family) {
- case AF_INET:
- xfrm_dst_ops = &net->xfrm.xfrm4_dst_ops;
- break;
-#if IS_ENABLED(CONFIG_IPV6)
- case AF_INET6:
- xfrm_dst_ops = &net->xfrm.xfrm6_dst_ops;
- break;
-#endif
- default:
- BUG();
- }
- *xfrm_dst_ops = *afinfo->dst_ops;
- }
- rtnl_unlock();
-
return err;
}
EXPORT_SYMBOL(xfrm_policy_register_afinfo);
@@ -2890,22 +2891,6 @@ int xfrm_policy_unregister_afinfo(struct xfrm_policy_afinfo *afinfo)
}
EXPORT_SYMBOL(xfrm_policy_unregister_afinfo);
-static void __net_init xfrm_dst_ops_init(struct net *net)
-{
- struct xfrm_policy_afinfo *afinfo;
-
- rcu_read_lock();
- afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET]);
- if (afinfo)
- net->xfrm.xfrm4_dst_ops = *afinfo->dst_ops;
-#if IS_ENABLED(CONFIG_IPV6)
- afinfo = rcu_dereference(xfrm_policy_afinfo[AF_INET6]);
- if (afinfo)
- net->xfrm.xfrm6_dst_ops = *afinfo->dst_ops;
-#endif
- rcu_read_unlock();
-}
-
static int xfrm_dev_event(struct notifier_block *this, unsigned long event, void *ptr)
{
struct net_device *dev = netdev_notifier_info_to_dev(ptr);
@@ -3054,7 +3039,6 @@ static int __net_init xfrm_net_init(struct net *net)
rv = xfrm_policy_init(net);
if (rv < 0)
goto out_policy;
- xfrm_dst_ops_init(net);
rv = xfrm_sysctl_init(net);
if (rv < 0)
goto out_sysctl;
diff --git a/net/xfrm/xfrm_user.c b/net/xfrm/xfrm_user.c
index 805681a7d356..2cc7af858c6f 100644
--- a/net/xfrm/xfrm_user.c
+++ b/net/xfrm/xfrm_user.c
@@ -2449,7 +2449,7 @@ static int xfrm_user_rcv_msg(struct sk_buff *skb, struct nlmsghdr *nlh)
int type, err;
#ifdef CONFIG_COMPAT
- if (is_compat_task())
+ if (in_compat_syscall())
return -ENOTSUPP;
#endif